From 927ce1d39fee46ab894519d56dd2454171c13b9b Mon Sep 17 00:00:00 2001 From: Michael Goin Date: Thu, 27 Mar 2025 03:46:23 -0600 Subject: [PATCH] Fix PixtralProcessor patch_size when spatial_merge_size is used (#37019) --- src/transformers/models/pixtral/processing_pixtral.py | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index 853d12e6fe..1a542add69 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -156,6 +156,8 @@ class PixtralProcessor(ProcessorMixin): **kwargs, ) + patch_size = self.patch_size * self.spatial_merge_size + if images is not None: if is_image_or_image_url(images): images = [images] @@ -172,7 +174,7 @@ class PixtralProcessor(ProcessorMixin): "Invalid input images. Please provide a single image, a list of images, or a list of lists of images." ) images = [load_image(im) if isinstance(im, str) else im for im in images] - image_inputs = self.image_processor(images, patch_size=self.patch_size, **output_kwargs["images_kwargs"]) + image_inputs = self.image_processor(images, patch_size=patch_size, **output_kwargs["images_kwargs"]) else: image_inputs = {} @@ -192,8 +194,8 @@ class PixtralProcessor(ProcessorMixin): for sample in text: while self.image_token in sample: height, width = next(image_sizes) - num_height_tokens = height // (self.patch_size * self.spatial_merge_size) - num_width_tokens = width // (self.patch_size * self.spatial_merge_size) + num_height_tokens = height // patch_size + num_width_tokens = width // patch_size replace_tokens = [ [self.image_token] * num_width_tokens + [self.image_break_token] ] * num_height_tokens