Fix PixtralProcessor patch_size when spatial_merge_size is used (#37019)

This commit is contained in:
Michael Goin
2025-03-27 03:46:23 -06:00
committed by GitHub
parent 49b5ab6a27
commit 927ce1d39f

View File

@@ -156,6 +156,8 @@ class PixtralProcessor(ProcessorMixin):
**kwargs, **kwargs,
) )
patch_size = self.patch_size * self.spatial_merge_size
if images is not None: if images is not None:
if is_image_or_image_url(images): if is_image_or_image_url(images):
images = [images] images = [images]
@@ -172,7 +174,7 @@ class PixtralProcessor(ProcessorMixin):
"Invalid input images. Please provide a single image, a list of images, or a list of lists of images." "Invalid input images. Please provide a single image, a list of images, or a list of lists of images."
) )
images = [load_image(im) if isinstance(im, str) else im for im in images] images = [load_image(im) if isinstance(im, str) else im for im in images]
image_inputs = self.image_processor(images, patch_size=self.patch_size, **output_kwargs["images_kwargs"]) image_inputs = self.image_processor(images, patch_size=patch_size, **output_kwargs["images_kwargs"])
else: else:
image_inputs = {} image_inputs = {}
@@ -192,8 +194,8 @@ class PixtralProcessor(ProcessorMixin):
for sample in text: for sample in text:
while self.image_token in sample: while self.image_token in sample:
height, width = next(image_sizes) height, width = next(image_sizes)
num_height_tokens = height // (self.patch_size * self.spatial_merge_size) num_height_tokens = height // patch_size
num_width_tokens = width // (self.patch_size * self.spatial_merge_size) num_width_tokens = width // patch_size
replace_tokens = [ replace_tokens = [
[self.image_token] * num_width_tokens + [self.image_break_token] [self.image_token] * num_width_tokens + [self.image_break_token]
] * num_height_tokens ] * num_height_tokens