Fix Qwen2VL processor to handle odd number of frames (#35431)

* fix: processing odd number of frames

* feat: add test case

* update: test one frame

* feat: support custom patch size

* fix: test with videos

* revert: change on patch repeat

* fix: much wow

* update: fixups

* fixup pls

* ruff fixup

* fix typo at least
This commit is contained in:
Jacky Lee
2025-01-08 04:49:00 -08:00
committed by GitHub
parent 3fde88b19d
commit 3c1895aa65
4 changed files with 51 additions and 13 deletions

View File

@@ -291,8 +291,9 @@ class Qwen2VLImageProcessor(BaseImageProcessor):
patches = np.array(processed_images) patches = np.array(processed_images)
if data_format == ChannelDimension.LAST: if data_format == ChannelDimension.LAST:
patches = patches.transpose(0, 3, 1, 2) patches = patches.transpose(0, 3, 1, 2)
if patches.shape[0] == 1: if patches.shape[0] % self.temporal_patch_size != 0:
patches = np.tile(patches, (self.temporal_patch_size, 1, 1, 1)) repeats = np.repeat(patches[-1][np.newaxis], self.temporal_patch_size - 1, axis=0)
patches = np.concatenate([patches, repeats], axis=0)
channel = patches.shape[1] channel = patches.shape[1]
grid_t = patches.shape[0] // self.temporal_patch_size grid_t = patches.shape[0] // self.temporal_patch_size
grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size grid_h, grid_w = resized_height // self.patch_size, resized_width // self.patch_size

View File

@@ -22,7 +22,7 @@ from transformers.models.qwen2_vl.image_processing_qwen2_vl import smart_resize
from transformers.testing_utils import require_torch, require_vision from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_torch_available, is_vision_available from transformers.utils import is_torch_available, is_vision_available
from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs from ...test_image_processing_common import ImageProcessingTestMixin, prepare_image_inputs, prepare_video_inputs
if is_torch_available(): if is_torch_available():
@@ -40,6 +40,7 @@ class Qwen2VLImageProcessingTester:
parent, parent,
batch_size=7, batch_size=7,
num_channels=3, num_channels=3,
num_frames=10,
min_resolution=56, min_resolution=56,
max_resolution=1024, max_resolution=1024,
min_pixels=56 * 56, min_pixels=56 * 56,
@@ -58,6 +59,7 @@ class Qwen2VLImageProcessingTester:
self.min_resolution = min_resolution self.min_resolution = min_resolution
self.max_resolution = max_resolution self.max_resolution = max_resolution
self.num_channels = num_channels self.num_channels = num_channels
self.num_frames = num_frames
self.image_mean = OPENAI_CLIP_MEAN self.image_mean = OPENAI_CLIP_MEAN
self.image_std = OPENAI_CLIP_STD self.image_std = OPENAI_CLIP_STD
self.min_pixels = min_pixels self.min_pixels = min_pixels
@@ -95,6 +97,18 @@ class Qwen2VLImageProcessingTester:
) )
return [[image] for image in images] return [[image] for image in images]
def prepare_video_inputs(self, equal_resolution=False, numpify=False, torchify=False):
    """Build test videos by forwarding this tester's settings to the shared helper.

    The batch size, channel count, frame count and resolution bounds come
    from the tester instance; the three flags are passed through unchanged
    to the module-level ``prepare_video_inputs`` function.
    """
    helper_kwargs = {
        "batch_size": self.batch_size,
        "num_channels": self.num_channels,
        "num_frames": self.num_frames,
        "min_resolution": self.min_resolution,
        "max_resolution": self.max_resolution,
        "equal_resolution": equal_resolution,
        "numpify": numpify,
        "torchify": torchify,
    }
    # Inside the method body this name resolves to the imported module-level
    # helper, not to this method.
    return prepare_video_inputs(**helper_kwargs)
@require_torch @require_torch
@require_vision @require_vision
@@ -247,3 +261,26 @@ class Qwen2VLImageProcessingTest(ImageProcessingTestMixin, unittest.TestCase):
# Image processor should return same pixel values, independently of input format # Image processor should return same pixel values, independently of input format
self.assertTrue((encoded_images_nested == encoded_images).all()) self.assertTrue((encoded_images_nested == encoded_images).all())
self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all()) self.assertTrue((image_grid_thws_nested == expected_image_grid_thws).all())
def test_video_inputs(self):
    """Check the flattened video patch shape for clips of one to six frames.

    Each case builds a fresh tester with the given frame count, runs the
    processor on equal-resolution videos and compares the shape of
    ``pixel_values_videos`` with the expected ``(rows, 1176)``.
    """
    processor = self.image_processing_class(**self.image_processor_dict)
    # (frame count, expected row count): each odd count shares its
    # expectation with the even count that follows it.
    cases = [(1, 34300), (2, 34300), (3, 68600), (4, 68600), (5, 102900), (6, 102900)]
    for frame_count, expected_rows in cases:
        tester = Qwen2VLImageProcessingTester(self, num_frames=frame_count)
        videos = tester.prepare_video_inputs(equal_resolution=True)
        encoded_video = processor(None, videos=videos, return_tensors="pt").pixel_values_videos
        self.assertEqual(tuple(encoded_video.shape), (expected_rows, 1176))
def test_custom_patch_size(self):
    """Run the processor on videos from testers built with several patch sizes.

    For every patch size the shape of ``pixel_values_videos`` is compared
    with the same fixed expectation.
    """
    processor = self.image_processing_class(**self.image_processor_dict)
    # NOTE(review): the processor is created once, before any tester with a
    # custom ``patch_size`` exists, and the tester is only used to generate
    # the input videos. The custom patch size therefore does not appear to
    # reach the processor, which would explain why one expected shape holds
    # for every value. Confirm whether the processor should be rebuilt per
    # patch size.
    expected_shape = (171500, 1176)
    for candidate_patch_size in (1, 3, 5, 7):
        tester = Qwen2VLImageProcessingTester(self, patch_size=candidate_patch_size)
        videos = tester.prepare_video_inputs(equal_resolution=True)
        encoded_video = processor(None, videos=videos, return_tensors="pt").pixel_values_videos
        self.assertEqual(tuple(encoded_video.shape), expected_shape)

View File

@@ -253,7 +253,7 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
""" """
Tests that VLMs throw an error with explicit message saying what is wrong Tests that VLMs throw an error with explicit message saying what is wrong
when number of images don't match number of image tokens in the text. when number of images don't match number of image tokens in the text.
Also we need to test multi-image cases when one prompr has multiple image tokens. Also we need to test multi-image cases when one prompt has multiple image tokens.
""" """
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common() config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes: for model_class in self.all_model_classes:

View File

@@ -125,19 +125,19 @@ def prepare_video_inputs(
assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time" assert not (numpify and torchify), "You cannot specify both numpy and PyTorch tensors at the same time"
video_inputs = [] video_inputs = []
for i in range(batch_size): for _ in range(batch_size):
if equal_resolution: if equal_resolution:
width = height = max_resolution width = height = max_resolution
else: else:
width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2) width, height = np.random.choice(np.arange(min_resolution, max_resolution), 2)
video = prepare_video( video = prepare_video(
num_frames=num_frames, num_frames=num_frames,
num_channels=num_channels, num_channels=num_channels,
width=width, width=width,
height=height, height=height,
numpify=numpify, numpify=numpify,
torchify=torchify, torchify=torchify,
) )
video_inputs.append(video) video_inputs.append(video)
return video_inputs return video_inputs