Fix Qwen2VL processor to handle odd number of frames (#35431)
* fix: processing odd number of frames
* feat: add test case
* update: test one frame
* feat: support custom patch size
* fix: test with videos
* revert: change on patch repeat
* fix: much wow
* update: fixups
* fixup pls
* ruff fixup
* fix typo at least
This commit is contained in:
@@ -291,8 +291,9 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
|
|||||||
patches = np.array(processed_images)
|
patches = np.array(processed_images)
|
||||||
if data_format == ChannelDimension.LAST:
|
if data_format == ChannelDimension.LAST:
|
||||||
patches = patches.transpose(0, 3, 1, 2)
|
patches = patches.transpose(0, 3, 1, 2)
|
||||||
if patches.shape[0] == 1:
|
if patches.shape[0] % self.temporal_patch_size != 0:
|
||||||
patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1))
|
repeats = np.repeat(patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0)
|
||||||
|
patches = np.concatenate([patches, repeats], axis=0)
|
||||||
channel = patches.shape[1]
|
channel = patches.shape[1]
|
||||||
grid_t = patches.shape[0] // self.temporal_patch_size
|
grid_t = patches.shape[0] // self.temporal_patch_size
|
||||||
grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
|
grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size
|
||||||
|
|||||||
@@ -22,7 +22,7 @@ from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
|
|||||||
from transformers.testing_utils import require_torch, require_vision
|
from transformers.testing_utils import require_torch, require_vision
|
||||||
from transformers.utils import is_torch_available, is_vision_available
|
from transformers.utils import is_torch_available, is_vision_available
|
||||||
|
|
||||||
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs
|
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs
|
||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
@@ -40,6 +40,7 @@ class Qwen2VLImageProcessingTester:
|
|||||||
parent,
|
parent,
|
||||||
batch_size=7,
|
batch_size=7,
|
||||||
num_channels=3,
|
num_channels=3,
|
||||||
|
num_frames=10,
|
||||||
min_resolution=56,
|
min_resolution=56,
|
||||||
max_resolution=1024,
|
max_resolution=1024,
|
||||||
min_pixels=56 * 56,
|
min_pixels=56 * 56,
|
||||||
@@ -58,6 +59,7 @@ class Qwen2VLImageProcessingTester:
|
|||||||
self.min_resolution = min_resolution
|
self.min_resolution = min_resolution
|
||||||
self.max_resolution = max_resolution
|
self.max_resolution = max_resolution
|
||||||
self.num_channels = num_channels
|
self.num_channels = num_channels
|
||||||
|
self.num_frames = num_frames
|
||||||
self.image_mean = OPENAI_CLIP_MEAN
|
self.image_mean = OPENAI_CLIP_MEAN
|
||||||
self.image_std = OPENAI_CLIP_STD
|
self.image_std = OPENAI_CLIP_STD
|
||||||
self.min_pixels = min_pixels
|
self.min_pixels = min_pixels
|
||||||
@@ -95,6 +97,18 @@ class Qwen2VLImageProcessingTester:
|
|||||||
)
|
)
|
||||||
return [[image] for image in images]
|
return [[image] for image in images]
|
||||||
|
|
||||||
|
def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
    """Build video inputs from this tester's stored settings.

    Forwards the tester's batch size, channel count, frame count and
    resolution bounds, together with the three flags received here, to the
    shared `prepare_video_inputs` helper and returns its result unchanged.
    """
    # Values fixed when the tester was constructed.
    tester_settings = {
        "batch_size": self.batch_size,
        "num_channels": self.num_channels,
        "num_frames": self.num_frames,
        "min_resolution": self.min_resolution,
        "max_resolution": self.max_resolution,
    }
    # Per-call options, passed through as given.
    call_options = {
        "equal_resolution": equal_resolution,
        "numpify": numpify,
        "torchify": torchify,
    }
    return prepare_video_inputs(**tester_settings, **call_options)
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
@require_vision
|
@require_vision
|
||||||
@@ -247,3 +261,26 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
|
|||||||
# Image processor should return same pixel values, independently of input format
|
# Image processor should return same pixel values, independently of input format
|
||||||
self.assertTrue((encoded_images_nested == encoded_images).all())
|
self.assertTrue((encoded_images_nested == encoded_images).all())
|
||||||
self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all())
|
self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all())
|
||||||
|
|
||||||
|
def test_video_inputs(self):
    """Check the shape of `pixel_values_videos` for videos of 1 to 6 frames.

    The expected table pairs every odd frame count with the same row count
    as the following even frame count.
    """
    processor = self.image_processing_class(**self.image_processor_dict)
    # Frame count -> expected first dimension of the encoded video.
    rows_by_frame_count = {1: 34300, 2: 34300, 3: 68600, 4: 68600, 5: 102900, 6: 102900}

    for frame_count, expected_rows in rows_by_frame_count.items():
        tester = Qwen2VLImageProcessingTester(self, num_frames=frame_count)
        videos = tester.prepare_video_inputs(equal_resolution=True)
        pixel_values_videos = processor(None, videos=videos, return_tensors="pt").pixel_values_videos
        self.assertEqual(tuple(pixel_values_videos.shape), (expected_rows, 1176))
|
||||||
|
|
||||||
|
def test_custom_patch_size(self):
    """Check the video output shape while the tester is built with several patch sizes.

    NOTE(review): the processor is created once from
    `self.image_processor_dict`, before the loop, so the `patch_size` given
    to the tester does not appear to reach the processor here. Confirm
    whether the processor should be rebuilt for each patch size.
    """
    processor = self.image_processing_class(**self.image_processor_dict)
    # The same shape is expected for every patch size in the loop.
    expected_shape = (171500, 1176)

    for patch_size in (1, 3, 5, 7):
        tester = Qwen2VLImageProcessingTester(self, patch_size=patch_size)
        videos = tester.prepare_video_inputs(equal_resolution=True)
        outputs = processor(None, videos=videos, return_tensors="pt")
        self.assertEqual(tuple(outputs.pixel_values_videos.shape), expected_shape)
|
||||||
|
|||||||
@@ -253,7 +253,7 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
|
|||||||
"""
|
"""
|
||||||
Tests that VLMs throw an error with explicit message saying what is wrong
|
Tests that VLMs throw an error with explicit message saying what is wrong
|
||||||
when number of images don't match number of image tokens in the text.
|
when number of images don't match number of image tokens in the text.
|
||||||
Also we need to test multi-image cases when one prompr has multiple image tokens.
|
Also we need to test multi-image cases when one prompt has multiple image tokens.
|
||||||
"""
|
"""
|
||||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||||
for model_class in self.all_model_classes:
|
for model_class in self.all_model_classes:
|
||||||
|
|||||||
@@ -125,7 +125,7 @@ def prepare_video_inputs(
|
|||||||
assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
|
assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
|
||||||
|
|
||||||
video_inputs = []
|
video_inputs = []
|
||||||
for i in range(batch_size):
|
for _ in range(batch_size):
|
||||||
if equal_resolution:
|
if equal_resolution:
|
||||||
width = height = max_resolution
|
width = height = max_resolution
|
||||||
else:
|
else:
|
||||||
|
|||||||
Reference in New Issue
Block a user