[processors] add tests for helper fn (#39629)
* add tests for helpers
* duplicate test for each model
* why llava next video has no helper
* oops must have been in the commit
* fix test after rebase
* add copy from
This commit is contained in:
committed by
GitHub
parent
6638b3642d
commit
8b237b8639
@@ -515,8 +515,8 @@ class AriaImageProcessor(BaseImageProcessor):
|
|||||||
Returns:
|
Returns:
|
||||||
`int`: Number of patches per image.
|
`int`: Number of patches per image.
|
||||||
"""
|
"""
|
||||||
split_image = images_kwargs.get("split_image", None) or self.split_image
|
split_image = images_kwargs["split_image"] if "split_image" in images_kwargs else self.split_image
|
||||||
max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
|
max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size
|
||||||
|
|
||||||
resized_height, resized_width = select_best_resolution((height, width), self.split_resolutions)
|
resized_height, resized_width = select_best_resolution((height, width), self.split_resolutions)
|
||||||
num_patches = 1 if not split_image else resized_height // max_image_size * resized_width // max_image_size
|
num_patches = 1 if not split_image else resized_height // max_image_size * resized_width // max_image_size
|
||||||
|
|||||||
@@ -901,8 +901,8 @@ class AriaImageProcessor(BaseImageProcessor):
|
|||||||
Returns:
|
Returns:
|
||||||
`int`: Number of patches per image.
|
`int`: Number of patches per image.
|
||||||
"""
|
"""
|
||||||
split_image = images_kwargs.get("split_image", None) or self.split_image
|
split_image = images_kwargs["split_image"] if "split_image" in images_kwargs else self.split_image
|
||||||
max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
|
max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size
|
||||||
|
|
||||||
resized_height, resized_width = select_best_resolution((height, width), self.split_resolutions)
|
resized_height, resized_width = select_best_resolution((height, width), self.split_resolutions)
|
||||||
num_patches = 1 if not split_image else resized_height // max_image_size * resized_width // max_image_size
|
num_patches = 1 if not split_image else resized_height // max_image_size * resized_width // max_image_size
|
||||||
|
|||||||
@@ -264,9 +264,8 @@ class ColPaliProcessor(ProcessorMixin):
|
|||||||
image_sizes (`list[list[int]]`, *optional*):
|
image_sizes (`list[list[int]]`, *optional*):
|
||||||
The input sizes formatted as (height, width) per each image.
|
The input sizes formatted as (height, width) per each image.
|
||||||
Returns:
|
Returns:
|
||||||
dict[str, list[int]]: A dictionary mapping each modality ("image", "video", "audio")
|
`MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
|
||||||
to a list containing the number of placeholder tokens required. If the model doesn't accept
|
input modalities, along with other useful data.
|
||||||
a certain modality or no input sizes are provided, the dict value is set to an empty list.
|
|
||||||
"""
|
"""
|
||||||
vision_data = {}
|
vision_data = {}
|
||||||
if image_sizes is not None:
|
if image_sizes is not None:
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ from transformers.models.colpali.processing_colpali import ColPaliProcessor
|
|||||||
from ...cache_utils import Cache
|
from ...cache_utils import Cache
|
||||||
from ...feature_extraction_utils import BatchFeature
|
from ...feature_extraction_utils import BatchFeature
|
||||||
from ...image_utils import ImageInput, is_valid_image
|
from ...image_utils import ImageInput, is_valid_image
|
||||||
from ...processing_utils import ProcessingKwargs, Unpack
|
from ...processing_utils import MultiModalData, ProcessingKwargs, Unpack
|
||||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||||
from ...utils import ModelOutput, auto_docstring, can_return_tuple, is_torch_available, logging
|
from ...utils import ModelOutput, auto_docstring, can_return_tuple, is_torch_available, logging
|
||||||
from .configuration_colqwen2 import ColQwen2Config
|
from .configuration_colqwen2 import ColQwen2Config
|
||||||
@@ -224,6 +224,32 @@ class ColQwen2Processor(ColPaliProcessor):
|
|||||||
|
|
||||||
return batch_query
|
return batch_query
|
||||||
|
|
||||||
|
def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
|
||||||
|
"""
|
||||||
|
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
|
||||||
|
Args:
|
||||||
|
image_sizes (`list[list[int]]`, *optional*):
|
||||||
|
The input sizes formatted as (height, width) per each image.
|
||||||
|
Returns:
|
||||||
|
`MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
|
||||||
|
input modalities, along with other useful data.
|
||||||
|
"""
|
||||||
|
|
||||||
|
vision_data = {}
|
||||||
|
if image_sizes is not None:
|
||||||
|
images_kwargs = ColQwen2ProcessorKwargs._defaults.get("images_kwargs", {})
|
||||||
|
images_kwargs.update(kwargs)
|
||||||
|
merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size
|
||||||
|
|
||||||
|
num_image_patches = [
|
||||||
|
self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
|
||||||
|
for image_size in image_sizes
|
||||||
|
]
|
||||||
|
num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
|
||||||
|
vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
|
||||||
|
|
||||||
|
return MultiModalData(**vision_data)
|
||||||
|
|
||||||
|
|
||||||
class ColQwen2PreTrainedModel(ColPaliPreTrainedModel):
|
class ColQwen2PreTrainedModel(ColPaliPreTrainedModel):
|
||||||
pass
|
pass
|
||||||
|
|||||||
@@ -226,20 +226,27 @@ class ColQwen2Processor(ProcessorMixin):
|
|||||||
def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
|
def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
|
||||||
"""
|
"""
|
||||||
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
|
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
image_sizes (list[list[str]], *optional*):
|
image_sizes (`list[list[int]]`, *optional*):
|
||||||
The input sizes formatted as (height, width) per each image.
|
The input sizes formatted as (height, width) per each image.
|
||||||
Returns:
|
Returns:
|
||||||
dict[str, list[int]]: A dictionary mapping each modality ("image", "video", "audio")
|
`MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
|
||||||
to a list containing the number of placeholder tokens required. If the model doesn't accept
|
input modalities, along with other useful data.
|
||||||
a certain modality or no input sizes are provided, the dict value is set to an empty list.
|
|
||||||
"""
|
"""
|
||||||
|
|
||||||
vision_data = {}
|
vision_data = {}
|
||||||
if image_sizes is not None:
|
if image_sizes is not None:
|
||||||
num_image_tokens = [self.image_seq_length] * len(image_sizes)
|
images_kwargs = ColQwen2ProcessorKwargs._defaults.get("images_kwargs", {})
|
||||||
num_image_patches = [1] * len(image_sizes)
|
images_kwargs.update(kwargs)
|
||||||
|
merge_size = images_kwargs.get("merge_size", None) or self.image_processor.merge_size
|
||||||
|
|
||||||
|
num_image_patches = [
|
||||||
|
self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
|
||||||
|
for image_size in image_sizes
|
||||||
|
]
|
||||||
|
num_image_tokens = [(num_patches // merge_size**2) for num_patches in num_image_patches]
|
||||||
vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
|
vision_data.update({"num_image_tokens": num_image_tokens, "num_image_patches": num_image_patches})
|
||||||
|
|
||||||
return MultiModalData(**vision_data)
|
return MultiModalData(**vision_data)
|
||||||
|
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
|
|||||||
@@ -449,8 +449,8 @@ class Glm4vImageProcessor(BaseImageProcessor):
|
|||||||
Returns:
|
Returns:
|
||||||
`int`: Number of image patches per image.
|
`int`: Number of image patches per image.
|
||||||
"""
|
"""
|
||||||
patch_size = images_kwargs.get("patch_size", None) or self.patch_size
|
patch_size = images_kwargs["patch_size"] if "patch_size" in images_kwargs else self.patch_size
|
||||||
merge_size = images_kwargs.get("merge_size", None) or self.merge_size
|
merge_size = images_kwargs["merge_size"] if "merge_size" in images_kwargs else self.merge_size
|
||||||
|
|
||||||
factor = patch_size * merge_size
|
factor = patch_size * merge_size
|
||||||
resized_height, resized_width = smart_resize(
|
resized_height, resized_width = smart_resize(
|
||||||
|
|||||||
@@ -505,10 +505,12 @@ class GotOcr2ImageProcessor(BaseImageProcessor):
|
|||||||
Returns:
|
Returns:
|
||||||
`int`: Number of patches per image.
|
`int`: Number of patches per image.
|
||||||
"""
|
"""
|
||||||
min_patches = images_kwargs.get("min_patches", None) or self.min_patches
|
min_patches = images_kwargs["min_patches"] if "min_patches" in images_kwargs else self.min_patches
|
||||||
max_patches = images_kwargs.get("max_patches", None) or self.max_patches
|
max_patches = images_kwargs["max_patches"] if "max_patches" in images_kwargs else self.max_patches
|
||||||
patch_size = images_kwargs.get("size", None) or self.size
|
patch_size = images_kwargs["patch_size"] if "patch_size" in images_kwargs else self.size
|
||||||
crop_to_patches = images_kwargs.get("crop_to_patches", None) or self.crop_to_patches
|
crop_to_patches = (
|
||||||
|
images_kwargs["crop_to_patches"] if "crop_to_patches" in images_kwargs else self.crop_to_patches
|
||||||
|
)
|
||||||
|
|
||||||
num_patches = 1
|
num_patches = 1
|
||||||
if crop_to_patches and max_patches > 1:
|
if crop_to_patches and max_patches > 1:
|
||||||
|
|||||||
@@ -223,7 +223,7 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
|
|||||||
data={"pixel_values": processed_images, "num_patches": num_patches}, tensor_type=return_tensors
|
data={"pixel_values": processed_images, "num_patches": num_patches}, tensor_type=return_tensors
|
||||||
)
|
)
|
||||||
|
|
||||||
def get_number_of_image_tokens(self, height: int, width: int, images_kwargs=None):
|
def get_number_of_image_patches(self, height: int, width: int, images_kwargs=None):
|
||||||
"""
|
"""
|
||||||
A utility that returns the number of patches for a given image size.
|
A utility that returns the number of patches for a given image size.
|
||||||
|
|
||||||
@@ -237,10 +237,12 @@ class GotOcr2ImageProcessorFast(BaseImageProcessorFast):
|
|||||||
Returns:
|
Returns:
|
||||||
`int`: Number of patches per image.
|
`int`: Number of patches per image.
|
||||||
"""
|
"""
|
||||||
min_patches = images_kwargs.get("min_patches", None) or self.min_patches
|
min_patches = images_kwargs["min_patches"] if "min_patches" in images_kwargs else self.min_patches
|
||||||
max_patches = images_kwargs.get("max_patches", None) or self.max_patches
|
max_patches = images_kwargs["max_patches"] if "max_patches" in images_kwargs else self.max_patches
|
||||||
patch_size = images_kwargs.get("size", None) or self.size
|
patch_size = images_kwargs["patch_size"] if "patch_size" in images_kwargs else self.size
|
||||||
crop_to_patches = images_kwargs.get("crop_to_patches", None) or self.crop_to_patches
|
crop_to_patches = (
|
||||||
|
images_kwargs["crop_to_patches"] if "crop_to_patches" in images_kwargs else self.crop_to_patches
|
||||||
|
)
|
||||||
|
|
||||||
num_patches = 1
|
num_patches = 1
|
||||||
if crop_to_patches and max_patches > 1:
|
if crop_to_patches and max_patches > 1:
|
||||||
|
|||||||
@@ -866,9 +866,11 @@ class Idefics3ImageProcessor(BaseImageProcessor):
|
|||||||
Returns:
|
Returns:
|
||||||
`int`: Number of patches per image.
|
`int`: Number of patches per image.
|
||||||
"""
|
"""
|
||||||
do_image_splitting = images_kwargs.get("do_image_splitting", None) or self.do_image_splitting
|
do_image_splitting = (
|
||||||
max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
|
images_kwargs["do_image_splitting"] if "do_image_splitting" in images_kwargs else self.do_image_splitting
|
||||||
size = images_kwargs.get("size", None) or self.size
|
)
|
||||||
|
max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size
|
||||||
|
size = images_kwargs["size"] if "size" in images_kwargs else self.size
|
||||||
|
|
||||||
num_patches = num_rows = num_cols = 1
|
num_patches = num_rows = num_cols = 1
|
||||||
if do_image_splitting:
|
if do_image_splitting:
|
||||||
|
|||||||
@@ -514,9 +514,11 @@ class Idefics3ImageProcessorFast(BaseImageProcessorFast):
|
|||||||
Returns:
|
Returns:
|
||||||
`int`: Number of patches per image.
|
`int`: Number of patches per image.
|
||||||
"""
|
"""
|
||||||
do_image_splitting = images_kwargs.get("do_image_splitting", None) or self.do_image_splitting
|
do_image_splitting = (
|
||||||
max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
|
images_kwargs["do_image_splitting"] if "do_image_splitting" in images_kwargs else self.do_image_splitting
|
||||||
size = images_kwargs.get("size", None) or self.size
|
)
|
||||||
|
max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size
|
||||||
|
size = images_kwargs["size"] if "size" in images_kwargs else self.size
|
||||||
|
|
||||||
num_patches = num_rows = num_cols = 1
|
num_patches = num_rows = num_cols = 1
|
||||||
if do_image_splitting:
|
if do_image_splitting:
|
||||||
|
|||||||
@@ -284,7 +284,7 @@ class InternVLProcessor(ProcessorMixin):
|
|||||||
images_kwargs.update(kwargs)
|
images_kwargs.update(kwargs)
|
||||||
|
|
||||||
num_image_patches = [
|
num_image_patches = [
|
||||||
self.image_processor.get_number_of_image_tokens(*image_size, images_kwargs)
|
self.image_processor.get_number_of_image_patches(*image_size, images_kwargs)
|
||||||
for image_size in image_sizes
|
for image_size in image_sizes
|
||||||
]
|
]
|
||||||
# Add 2 for BOI and EOI tokens
|
# Add 2 for BOI and EOI tokens
|
||||||
|
|||||||
@@ -231,14 +231,9 @@ class LlavaNextProcessor(ProcessorMixin):
|
|||||||
Args:
|
Args:
|
||||||
image_sizes (`list[list[int]]`, *optional*):
|
image_sizes (`list[list[int]]`, *optional*):
|
||||||
The input sizes formatted as (height, width) per each image.
|
The input sizes formatted as (height, width) per each image.
|
||||||
video_sizes (list[list[str]], *optional*):
|
|
||||||
The input sizes formatted as (num_frames, height, width) per each video.
|
|
||||||
audio_lengths (list[int], *optional*):
|
|
||||||
The input length formatted as per each audio.
|
|
||||||
Returns:
|
Returns:
|
||||||
dict[str, list[int]]: A dictionary mapping each modality ("image", "video", "audio")
|
`MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
|
||||||
to a list containing the number of placeholder tokens required. If the model doesn't accept
|
input modalities, along with other useful data.
|
||||||
a certain modality or no input sizes are provided, the dict value is set to an empty list.
|
|
||||||
"""
|
"""
|
||||||
vision_data = {}
|
vision_data = {}
|
||||||
if image_sizes is not None:
|
if image_sizes is not None:
|
||||||
|
|||||||
@@ -23,7 +23,7 @@ import numpy as np
|
|||||||
from ...feature_extraction_utils import BatchFeature
|
from ...feature_extraction_utils import BatchFeature
|
||||||
from ...image_processing_utils import select_best_resolution
|
from ...image_processing_utils import select_best_resolution
|
||||||
from ...image_utils import ImageInput, get_image_size, to_numpy_array
|
from ...image_utils import ImageInput, get_image_size, to_numpy_array
|
||||||
from ...processing_utils import ProcessingKwargs, ProcessorMixin, Unpack
|
from ...processing_utils import MultiModalData, ProcessingKwargs, ProcessorMixin, Unpack
|
||||||
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||||
from ...utils import logging
|
from ...utils import logging
|
||||||
from ...video_utils import VideoInput
|
from ...video_utils import VideoInput
|
||||||
@@ -265,6 +265,43 @@ class LlavaNextVideoProcessor(ProcessorMixin):
|
|||||||
newline_features = current_height
|
newline_features = current_height
|
||||||
return (unpadded_features, newline_features)
|
return (unpadded_features, newline_features)
|
||||||
|
|
||||||
|
def _get_num_multimodal_tokens(self, image_sizes=None, **kwargs):
|
||||||
|
"""
|
||||||
|
Computes the number of placeholder tokens needed for multimodal inputs with the given sizes.
|
||||||
|
Args:
|
||||||
|
image_sizes (`list[list[int]]`, *optional*):
|
||||||
|
The input sizes formatted as (height, width) per each image.
|
||||||
|
Returns:
|
||||||
|
`MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
|
||||||
|
input modalities, along with other useful data.
|
||||||
|
"""
|
||||||
|
vision_data = {}
|
||||||
|
if image_sizes is not None:
|
||||||
|
images_kwargs = LlavaNextVideoProcessorKwargs._defaults.get("images_kwargs", {})
|
||||||
|
images_kwargs.update(kwargs)
|
||||||
|
|
||||||
|
size = images_kwargs.get("size", None) or self.image_processor.size
|
||||||
|
size = (
|
||||||
|
(size["shortest_edge"], size["shortest_edge"])
|
||||||
|
if "shortest_edge" in size
|
||||||
|
else (min(size["height"], size["width"]), min(size["height"], size["width"]))
|
||||||
|
)
|
||||||
|
processed_height, processed_width = size
|
||||||
|
|
||||||
|
batch_num_image_tokens = []
|
||||||
|
num_image_patches = [1] * len(image_sizes) # llava-next doesn't batch pixels as Idefics, thus `1` patch
|
||||||
|
for image_size in image_sizes:
|
||||||
|
orig_height, orig_width = image_size
|
||||||
|
num_image_tokens = self._get_number_of_features(
|
||||||
|
orig_height, orig_width, processed_height, processed_width
|
||||||
|
)
|
||||||
|
if self.vision_feature_select_strategy == "default":
|
||||||
|
num_image_tokens -= 1
|
||||||
|
batch_num_image_tokens.append(num_image_tokens)
|
||||||
|
vision_data.update({"num_image_tokens": batch_num_image_tokens, "num_image_patches": num_image_patches})
|
||||||
|
|
||||||
|
return MultiModalData(**vision_data)
|
||||||
|
|
||||||
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
|
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
"""
|
"""
|
||||||
|
|||||||
@@ -327,9 +327,8 @@ class PaliGemmaProcessor(ProcessorMixin):
|
|||||||
image_sizes (`list[list[int]]`, *optional*):
|
image_sizes (`list[list[int]]`, *optional*):
|
||||||
The input sizes formatted as (height, width) per each image.
|
The input sizes formatted as (height, width) per each image.
|
||||||
Returns:
|
Returns:
|
||||||
dict[str, list[int]]: A dictionary mapping each modality ("image", "video", "audio")
|
`MultiModalData`: A `MultiModalData` object holding number of tokens per each of the provided
|
||||||
to a list containing the number of placeholder tokens required. If the model doesn't accept
|
input modalities, along with other useful data.
|
||||||
a certain modality or no input sizes are provided, the dict value is set to an empty list.
|
|
||||||
"""
|
"""
|
||||||
vision_data = {}
|
vision_data = {}
|
||||||
if image_sizes is not None:
|
if image_sizes is not None:
|
||||||
|
|||||||
@@ -502,10 +502,10 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
|
|||||||
Returns:
|
Returns:
|
||||||
`int`: Number of image patches per image.
|
`int`: Number of image patches per image.
|
||||||
"""
|
"""
|
||||||
min_pixels = images_kwargs.get("min_pixels", None) or self.size["shortest_edge"]
|
min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"]
|
||||||
max_pixels = images_kwargs.get("max_pixels", None) or self.size["longest_edge"]
|
max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"]
|
||||||
patch_size = images_kwargs.get("patch_size", None) or self.patch_size
|
patch_size = images_kwargs["patch_size"] if "patch_size" in images_kwargs else self.patch_size
|
||||||
merge_size = images_kwargs.get("merge_size", None) or self.merge_size
|
merge_size = images_kwargs["merge_size"] if "merge_size" in images_kwargs else self.merge_size
|
||||||
|
|
||||||
factor = patch_size * merge_size
|
factor = patch_size * merge_size
|
||||||
resized_height, resized_width = smart_resize(
|
resized_height, resized_width = smart_resize(
|
||||||
|
|||||||
@@ -299,10 +299,10 @@ class Qwen2VLImageProcessorFast(BaseImageProcessorFast):
|
|||||||
Returns:
|
Returns:
|
||||||
`int`: Number of image patches per image.
|
`int`: Number of image patches per image.
|
||||||
"""
|
"""
|
||||||
min_pixels = images_kwargs.get("min_pixels", None) or self.size["shortest_edge"]
|
min_pixels = images_kwargs["min_pixels"] if "min_pixels" in images_kwargs else self.size["shortest_edge"]
|
||||||
max_pixels = images_kwargs.get("max_pixels", None) or self.size["longest_edge"]
|
max_pixels = images_kwargs["max_pixels"] if "max_pixels" in images_kwargs else self.size["longest_edge"]
|
||||||
patch_size = images_kwargs.get("patch_size", None) or self.patch_size
|
patch_size = images_kwargs["patch_size"] if "patch_size" in images_kwargs else self.patch_size
|
||||||
merge_size = images_kwargs.get("merge_size", None) or self.merge_size
|
merge_size = images_kwargs["merge_size"] if "merge_size" in images_kwargs else self.merge_size
|
||||||
|
|
||||||
factor = patch_size * merge_size
|
factor = patch_size * merge_size
|
||||||
resized_height, resized_width = smart_resize(
|
resized_height, resized_width = smart_resize(
|
||||||
|
|||||||
@@ -863,9 +863,11 @@ class SmolVLMImageProcessor(BaseImageProcessor):
|
|||||||
Returns:
|
Returns:
|
||||||
`int`: Number of patches per image.
|
`int`: Number of patches per image.
|
||||||
"""
|
"""
|
||||||
do_image_splitting = images_kwargs.get("do_image_splitting", None) or self.do_image_splitting
|
do_image_splitting = (
|
||||||
max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
|
images_kwargs["do_image_splitting"] if "do_image_splitting" in images_kwargs else self.do_image_splitting
|
||||||
size = images_kwargs.get("size", None) or self.size
|
)
|
||||||
|
max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size
|
||||||
|
size = images_kwargs["size"] if "size" in images_kwargs else self.size
|
||||||
|
|
||||||
num_patches = num_rows = num_cols = 1
|
num_patches = num_rows = num_cols = 1
|
||||||
if do_image_splitting:
|
if do_image_splitting:
|
||||||
|
|||||||
@@ -504,9 +504,11 @@ class SmolVLMImageProcessorFast(BaseImageProcessorFast):
|
|||||||
Returns:
|
Returns:
|
||||||
`int`: Number of patches per image.
|
`int`: Number of patches per image.
|
||||||
"""
|
"""
|
||||||
do_image_splitting = images_kwargs.get("do_image_splitting", None) or self.do_image_splitting
|
do_image_splitting = (
|
||||||
max_image_size = images_kwargs.get("max_image_size", None) or self.max_image_size
|
images_kwargs["do_image_splitting"] if "do_image_splitting" in images_kwargs else self.do_image_splitting
|
||||||
size = images_kwargs.get("size", None) or self.size
|
)
|
||||||
|
max_image_size = images_kwargs["max_image_size"] if "max_image_size" in images_kwargs else self.max_image_size
|
||||||
|
size = images_kwargs["size"] if "size" in images_kwargs else self.size
|
||||||
|
|
||||||
num_patches = num_rows = num_cols = 1
|
num_patches = num_rows = num_cols = 1
|
||||||
if do_image_splitting:
|
if do_image_splitting:
|
||||||
|
|||||||
@@ -302,3 +302,19 @@ class AriaImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
encoded_images.shape[:-1] if input_data_format == ChannelDimension.LAST else encoded_images.shape[1:]
|
encoded_images.shape[:-1] if input_data_format == ChannelDimension.LAST else encoded_images.shape[1:]
|
||||||
)
|
)
|
||||||
self.assertEqual(encoded_image_shape, image_shape)
|
self.assertEqual(encoded_image_shape, image_shape)
|
||||||
|
|
||||||
|
def test_get_num_patches_without_images(self):
|
||||||
|
for image_processing_class in self.image_processor_list:
|
||||||
|
image_processing = image_processing_class(**self.image_processor_dict)
|
||||||
|
num_patches = image_processing.get_number_of_image_patches(height=100, width=100, images_kwargs={})
|
||||||
|
self.assertEqual(num_patches, 1)
|
||||||
|
|
||||||
|
num_patches = image_processing.get_number_of_image_patches(
|
||||||
|
height=300, width=500, images_kwargs={"split_image": True}
|
||||||
|
)
|
||||||
|
self.assertEqual(num_patches, 1)
|
||||||
|
|
||||||
|
num_patches = image_processing.get_number_of_image_patches(
|
||||||
|
height=100, width=100, images_kwargs={"split_image": True, "max_image_size": 200}
|
||||||
|
)
|
||||||
|
self.assertEqual(num_patches, 19)
|
||||||
|
|||||||
@@ -95,6 +95,19 @@ class AriaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
def tearDownClass(cls):
|
def tearDownClass(cls):
|
||||||
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|
||||||
def test_process_interleaved_images_prompts_image_splitting(self):
|
def test_process_interleaved_images_prompts_image_splitting(self):
|
||||||
processor = self.get_processor()
|
processor = self.get_processor()
|
||||||
processor.image_processor.split_image = True
|
processor.image_processor.split_image = True
|
||||||
|
|||||||
@@ -80,6 +80,19 @@ class AyaVisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
def tearDownClass(cls):
|
def tearDownClass(cls):
|
||||||
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
def test_process_interleaved_images_videos(self):
|
def test_process_interleaved_images_videos(self):
|
||||||
processor = self.get_processor()
|
processor = self.get_processor()
|
||||||
|
|||||||
@@ -74,3 +74,16 @@ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
@staticmethod
|
@staticmethod
|
||||||
def prepare_processor_dict():
|
def prepare_processor_dict():
|
||||||
return {"image_seq_length": 2} # fmt: skip
|
return {"image_seq_length": 2} # fmt: skip
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|||||||
@@ -54,6 +54,19 @@ class ColPaliProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
def tearDownClass(cls):
|
def tearDownClass(cls):
|
||||||
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
@require_vision
|
@require_vision
|
||||||
def test_process_images(self):
|
def test_process_images(self):
|
||||||
|
|||||||
@@ -57,6 +57,19 @@ class ColQwen2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
def tearDownClass(cls):
|
def tearDownClass(cls):
|
||||||
shutil.rmtree(cls.tmpdirname)
|
shutil.rmtree(cls.tmpdirname)
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|
||||||
def test_process_images(self):
|
def test_process_images(self):
|
||||||
# Processor configuration
|
# Processor configuration
|
||||||
image_input = self.prepare_image_inputs()
|
image_input = self.prepare_image_inputs()
|
||||||
|
|||||||
@@ -90,3 +90,16 @@ class Emu3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
|
|
||||||
# For an image where pixels go from 0 to 255 the diff can be 1 due to some numerical precision errors when scaling and unscaling
|
# For an image where pixels go from 0 to 255 the diff can be 1 due to some numerical precision errors when scaling and unscaling
|
||||||
self.assertTrue(np.abs(orig_image - unnormalized_images).max() >= 1)
|
self.assertTrue(np.abs(orig_image - unnormalized_images).max() >= 1)
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|||||||
@@ -64,6 +64,19 @@ class FuyuProcessingTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
def get_image_processor(self, **kwargs):
|
def get_image_processor(self, **kwargs):
|
||||||
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
|
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|
||||||
def test_fuyu_processing(self):
|
def test_fuyu_processing(self):
|
||||||
"""
|
"""
|
||||||
Test to ensure that the standard processing on a gold example matches adept's code.
|
Test to ensure that the standard processing on a gold example matches adept's code.
|
||||||
|
|||||||
@@ -58,6 +58,19 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
processor.save_pretrained(cls.tmpdirname)
|
processor.save_pretrained(cls.tmpdirname)
|
||||||
cls.image_token = processor.boi_token
|
cls.image_token = processor.boi_token
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|
||||||
@classmethod
|
@classmethod
|
||||||
def tearDownClass(cls):
|
def tearDownClass(cls):
|
||||||
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||||
|
|||||||
@@ -169,3 +169,24 @@ class GotOcr2ProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
)
|
)
|
||||||
self.assertEqual(len(processed_images[0]), 5)
|
self.assertEqual(len(processed_images[0]), 5)
|
||||||
self.assertEqual(processed_images.shape[-2:], (20, 20))
|
self.assertEqual(processed_images.shape[-2:], (20, 20))
|
||||||
|
|
||||||
|
def test_get_num_patches_without_images(self):
|
||||||
|
for image_processing_class in self.image_processor_list:
|
||||||
|
image_processing = image_processing_class(**self.image_processor_dict)
|
||||||
|
num_patches = image_processing.get_number_of_image_patches(height=100, width=100, images_kwargs={})
|
||||||
|
self.assertEqual(num_patches, 1)
|
||||||
|
|
||||||
|
num_patches = image_processing.get_number_of_image_patches(
|
||||||
|
height=300, width=500, images_kwargs={"crop_to_patches": False}
|
||||||
|
)
|
||||||
|
self.assertEqual(num_patches, 1)
|
||||||
|
|
||||||
|
num_patches = image_processing.get_number_of_image_patches(
|
||||||
|
height=100, width=100, images_kwargs={"crop_to_patches": True}
|
||||||
|
)
|
||||||
|
self.assertEqual(num_patches, 10)
|
||||||
|
|
||||||
|
num_patches = image_processing.get_number_of_image_patches(
|
||||||
|
height=100, width=100, images_kwargs={"crop_to_patches": True, "max_patches": 200}
|
||||||
|
)
|
||||||
|
self.assertEqual(num_patches, 50)
|
||||||
|
|||||||
@@ -358,3 +358,28 @@ class Idefics3ImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
)
|
)
|
||||||
self.assertEqual(encoding_slow.rows, encoding_fast.rows)
|
self.assertEqual(encoding_slow.rows, encoding_fast.rows)
|
||||||
self.assertEqual(encoding_slow.cols, encoding_fast.cols)
|
self.assertEqual(encoding_slow.cols, encoding_fast.cols)
|
||||||
|
|
||||||
|
def test_get_num_patches_without_images(self):
|
||||||
|
for image_processing_class in self.image_processor_list:
|
||||||
|
image_processing = image_processing_class(**self.image_processor_dict)
|
||||||
|
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
|
||||||
|
height=100, width=100, images_kwargs={}
|
||||||
|
)
|
||||||
|
self.assertEqual(num_patches_and_row_cols, (5, 2, 2))
|
||||||
|
|
||||||
|
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
|
||||||
|
height=300, width=500, images_kwargs={"do_image_splitting": False}
|
||||||
|
)
|
||||||
|
self.assertEqual(num_patches_and_row_cols, (1, 1, 1))
|
||||||
|
|
||||||
|
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
|
||||||
|
height=300, width=500, images_kwargs={"do_image_splitting": True}
|
||||||
|
)
|
||||||
|
self.assertEqual(num_patches_and_row_cols, (5, 2, 2))
|
||||||
|
|
||||||
|
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
|
||||||
|
height=300,
|
||||||
|
width=600,
|
||||||
|
images_kwargs={"do_image_splitting": True, "max_image_size": {"longest_edge": 30}},
|
||||||
|
)
|
||||||
|
self.assertEqual(num_patches_and_row_cols, (3, 1, 2))
|
||||||
|
|||||||
@@ -84,6 +84,19 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
def prepare_processor_dict():
|
def prepare_processor_dict():
|
||||||
return {"image_seq_len": 2}
|
return {"image_seq_len": 2}
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|
||||||
def get_split_image_expected_tokens(self, processor, image_rows, image_cols):
|
def get_split_image_expected_tokens(self, processor, image_rows, image_cols):
|
||||||
text_split_images = []
|
text_split_images = []
|
||||||
for n_h in range(image_rows):
|
for n_h in range(image_rows):
|
||||||
|
|||||||
@@ -97,6 +97,19 @@ class InternVLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
def tearDownClass(cls):
|
def tearDownClass(cls):
|
||||||
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|
||||||
@require_av
|
@require_av
|
||||||
@require_torch
|
@require_torch
|
||||||
def test_process_interleaved_images_videos(self):
|
def test_process_interleaved_images_videos(self):
|
||||||
|
|||||||
@@ -61,6 +61,18 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
"vision_feature_select_strategy": "default"
|
"vision_feature_select_strategy": "default"
|
||||||
} # fmt: skip
|
} # fmt: skip
|
||||||
|
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|
||||||
def test_chat_template_is_saved(self):
|
def test_chat_template_is_saved(self):
|
||||||
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
|
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
|
||||||
processor_dict_loaded = json.loads(processor_loaded.to_json_string())
|
processor_dict_loaded = json.loads(processor_loaded.to_json_string())
|
||||||
|
|||||||
@@ -66,6 +66,19 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
"vision_feature_select_strategy": "default"
|
"vision_feature_select_strategy": "default"
|
||||||
} # fmt: skip
|
} # fmt: skip
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|
||||||
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
|
||||||
def test_chat_template_is_saved(self):
|
def test_chat_template_is_saved(self):
|
||||||
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
|
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
|
||||||
|
|||||||
@@ -75,6 +75,19 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
"vision_feature_select_strategy": "default",
|
"vision_feature_select_strategy": "default",
|
||||||
}
|
}
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|
||||||
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
|
||||||
def test_chat_template_is_saved(self):
|
def test_chat_template_is_saved(self):
|
||||||
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
|
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
|
||||||
|
|||||||
@@ -79,6 +79,19 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
"vision_feature_select_strategy": "default"
|
"vision_feature_select_strategy": "default"
|
||||||
} # fmt: skip
|
} # fmt: skip
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|
||||||
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_chat_template_is_saved
|
||||||
def test_chat_template_is_saved(self):
|
def test_chat_template_is_saved(self):
|
||||||
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
|
processor_loaded = self.processor_class.from_pretrained(self.tmpdirname)
|
||||||
|
|||||||
@@ -48,6 +48,19 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
def tearDownClass(cls):
|
def tearDownClass(cls):
|
||||||
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
@require_vision
|
@require_vision
|
||||||
def test_image_seq_length(self):
|
def test_image_seq_length(self):
|
||||||
|
|||||||
@@ -65,6 +65,19 @@ class Qwen2_5_VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
def tearDownClass(cls):
|
def tearDownClass(cls):
|
||||||
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|
||||||
def test_save_load_pretrained_default(self):
|
def test_save_load_pretrained_default(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
image_processor = self.get_image_processor()
|
image_processor = self.get_image_processor()
|
||||||
|
|||||||
@@ -394,3 +394,17 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
self._assert_slow_fast_tensors_equivalence(
|
self._assert_slow_fast_tensors_equivalence(
|
||||||
encoding_slow.image_grid_thw.float(), encoding_fast.image_grid_thw.float()
|
encoding_slow.image_grid_thw.float(), encoding_fast.image_grid_thw.float()
|
||||||
)
|
)
|
||||||
|
|
||||||
|
def test_get_num_patches_without_images(self):
|
||||||
|
for image_processing_class in self.image_processor_list:
|
||||||
|
image_processing = image_processing_class(**self.image_processor_dict)
|
||||||
|
num_patches = image_processing.get_number_of_image_patches(height=100, width=100, images_kwargs={})
|
||||||
|
self.assertEqual(num_patches, 64)
|
||||||
|
|
||||||
|
num_patches = image_processing.get_number_of_image_patches(height=200, width=50, images_kwargs={})
|
||||||
|
self.assertEqual(num_patches, 56)
|
||||||
|
|
||||||
|
num_patches = image_processing.get_number_of_image_patches(
|
||||||
|
height=100, width=100, images_kwargs={"patch_size": 28}
|
||||||
|
)
|
||||||
|
self.assertEqual(num_patches, 16)
|
||||||
|
|||||||
@@ -68,6 +68,19 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
def tearDownClass(cls):
|
def tearDownClass(cls):
|
||||||
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
shutil.rmtree(cls.tmpdirname, ignore_errors=True)
|
||||||
|
|
||||||
|
# Copied from tests.models.llava.test_processor_llava.LlavaProcessorTest.test_get_num_vision_tokens
|
||||||
|
def test_get_num_vision_tokens(self):
|
||||||
|
"Tests general functionality of the helper used internally in vLLM"
|
||||||
|
|
||||||
|
processor = self.get_processor()
|
||||||
|
|
||||||
|
output = processor._get_num_multimodal_tokens(image_sizes=[(100, 100), (300, 100), (500, 30)])
|
||||||
|
self.assertTrue("num_image_tokens" in output)
|
||||||
|
self.assertEqual(len(output["num_image_tokens"]), 3)
|
||||||
|
|
||||||
|
self.assertTrue("num_image_patches" in output)
|
||||||
|
self.assertEqual(len(output["num_image_patches"]), 3)
|
||||||
|
|
||||||
def test_save_load_pretrained_default(self):
|
def test_save_load_pretrained_default(self):
|
||||||
tokenizer = self.get_tokenizer()
|
tokenizer = self.get_tokenizer()
|
||||||
image_processor = self.get_image_processor()
|
image_processor = self.get_image_processor()
|
||||||
|
|||||||
@@ -358,3 +358,28 @@ class SmolVLMImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
)
|
)
|
||||||
self.assertEqual(encoding_slow.rows, encoding_fast.rows)
|
self.assertEqual(encoding_slow.rows, encoding_fast.rows)
|
||||||
self.assertEqual(encoding_slow.cols, encoding_fast.cols)
|
self.assertEqual(encoding_slow.cols, encoding_fast.cols)
|
||||||
|
|
||||||
|
def test_get_num_patches_without_images(self):
|
||||||
|
for image_processing_class in self.image_processor_list:
|
||||||
|
image_processing = image_processing_class(**self.image_processor_dict)
|
||||||
|
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
|
||||||
|
height=100, width=100, images_kwargs={}
|
||||||
|
)
|
||||||
|
self.assertEqual(num_patches_and_row_cols, (5, 2, 2))
|
||||||
|
|
||||||
|
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
|
||||||
|
height=300, width=500, images_kwargs={"do_image_splitting": False}
|
||||||
|
)
|
||||||
|
self.assertEqual(num_patches_and_row_cols, (1, 1, 1))
|
||||||
|
|
||||||
|
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
|
||||||
|
height=300, width=500, images_kwargs={"do_image_splitting": True}
|
||||||
|
)
|
||||||
|
self.assertEqual(num_patches_and_row_cols, (5, 2, 2))
|
||||||
|
|
||||||
|
num_patches_and_row_cols = image_processing.get_number_of_image_patches(
|
||||||
|
height=300,
|
||||||
|
width=600,
|
||||||
|
images_kwargs={"do_image_splitting": True, "max_image_size": {"longest_edge": 30}},
|
||||||
|
)
|
||||||
|
self.assertEqual(num_patches_and_row_cols, (3, 1, 2))
|
||||||
|
|||||||
Reference in New Issue
Block a user