[vlm] adjust max length for special tokens (#37342)

* update

* apply suggestion

* fix tests for main branch

* remove unused logger

* add special tokens in tests

* nit

* fix more tests

* fix test

* pg also
This commit is contained in:
Raushan Turganbay
2025-04-16 20:49:20 +02:00
committed by GitHub
parent c94c59fc47
commit 32eca7197a
39 changed files with 414 additions and 98 deletions

View File

@@ -946,6 +946,8 @@ class AriaProcessor(ProcessorMixin):
size_conversion = {490: 128, 980: 256}
self.size_conversion = {int(k): v for k, v in size_conversion.items()}
self.image_token = tokenizer.image_token
self.image_token_id = tokenizer.image_token_id
if tokenizer is not None and tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.unk_token
@@ -986,10 +988,12 @@ class AriaProcessor(ProcessorMixin):
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
if images is not None:
image_inputs = self.image_processor(
images,
@@ -1007,12 +1011,11 @@ class AriaProcessor(ProcessorMixin):
image_inputs = {}
prompt_strings = text
text_inputs = self.tokenizer(
prompt_strings,
**output_kwargs["text_kwargs"],
)
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
return BatchFeature(data={**text_inputs, **image_inputs})
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
def batch_decode(self, *args, **kwargs):
"""

View File

@@ -72,6 +72,8 @@ class AriaProcessor(ProcessorMixin):
size_conversion = {490: 128, 980: 256}
self.size_conversion = {int(k): v for k, v in size_conversion.items()}
self.image_token = tokenizer.image_token
self.image_token_id = tokenizer.image_token_id
if tokenizer is not None and tokenizer.pad_token is None:
tokenizer.pad_token = tokenizer.unk_token
@@ -112,10 +114,12 @@ class AriaProcessor(ProcessorMixin):
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
if images is not None:
image_inputs = self.image_processor(
images,
@@ -133,12 +137,11 @@ class AriaProcessor(ProcessorMixin):
image_inputs = {}
prompt_strings = text
text_inputs = self.tokenizer(
prompt_strings,
**output_kwargs["text_kwargs"],
)
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
return BatchFeature(data={**text_inputs, **image_inputs})
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
def batch_decode(self, *args, **kwargs):
"""

View File

@@ -121,6 +121,7 @@ class AyaVisionProcessor(ProcessorMixin):
super().__init__(image_processor, tokenizer, chat_template=chat_template)
self.image_token = image_token
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
self.patch_size = patch_size * downsample_factor
self.img_size = img_size
@@ -224,9 +225,11 @@ class AyaVisionProcessor(ProcessorMixin):
text = processed_text
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
return BatchFeature(data={**text_inputs, **image_inputs})
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
def batch_decode(self, *args, **kwargs):
"""

View File

@@ -68,6 +68,7 @@ class ChameleonProcessor(ProcessorMixin):
def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = "<image>"):
self.image_seq_length = image_seq_length
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
self.image_start_token = (
tokenizer.boi_token if hasattr(tokenizer, "boi_token") else "<racm3:break>"
) # fixed tokens for start and end, so can hardcode
@@ -140,12 +141,14 @@ class ChameleonProcessor(ProcessorMixin):
sample += self.tokenizer.sep_token # special Chameleon treatment to add sep for chat mode
prompt_strings.append(sample)
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(prompt_strings, data, modalities=["image"])
if images is not None:
data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"])
return BatchFeature(data=data, tensor_type=return_tensors)
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
def batch_decode(self, *args, **kwargs):

View File

@@ -75,6 +75,7 @@ class Emu3Processor(ProcessorMixin):
**kwargs,
):
self.image_token = tokenizer.image_token # image_token as placeholder to be replaced by vq-vae tokens
self.image_token_id = tokenizer.image_token_id
self.image_start_token = tokenizer.boi_token # "<|image start|>" fixed tokens for start and end of image
self.image_end_token = tokenizer.eoi_token # "<|image end|>"
self.fake_token_around_image = tokenizer.image_wrapper_token # "<|image token|>" every image starts with it
@@ -177,10 +178,13 @@ class Emu3Processor(ProcessorMixin):
image_features["image_sizes"] = [[height, width]] * len(text)
# else just generate from text-only input, and we do no special treatment for text
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
data = self.tokenizer(text, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(text, data, modalities=["image"])
data.update(**image_features)
return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].pop("return_tensors", None))
return BatchFeature(data=data, tensor_type=return_tensors)
def calculate_generate_size(self, ratio, image_area, spatial_factor):
width, height = map(int, ratio.split(":"))

View File

@@ -65,7 +65,7 @@ class Gemma3Processor(ProcessorMixin):
self.image_seq_length = image_seq_length
self.image_token_id = tokenizer.image_token_id
self.boi_token = tokenizer.boi_token
self.image_token = tokenizer.boi_token
self.image_token = tokenizer.image_token
image_tokens_expanded = "".join([tokenizer.image_token] * image_seq_length)
self.full_image_sequence = f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n"
@@ -138,6 +138,7 @@ class Gemma3Processor(ProcessorMixin):
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"], return_tensors="np")
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
# Add token type ids manually, as tokenizer can't do arbitrary position token types
array_ids = text_inputs["input_ids"]

View File

@@ -107,6 +107,8 @@ class GotOcr2Processor(ProcessorMixin):
self.img_start_token = "<img>"
self.img_end_token = "</img>"
self.img_pad_token = "<imgpad>"
self.image_token = "<imgpad>" # keep the above for BC, but we need to call it `image_token`
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail."
def _make_list_of_inputs(self, images, text, box, color, multi_page):
@@ -250,8 +252,11 @@ class GotOcr2Processor(ProcessorMixin):
)
text.append(prompt)
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
return BatchFeature(data={**text_inputs, **image_inputs})
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
def batch_decode(self, *args, **kwargs):
"""

View File

@@ -98,13 +98,15 @@ class Idefics2Processor(ProcessorMixin):
raise ValueError("You need to specify a `tokenizer`.")
if not hasattr(tokenizer, "image_token"):
self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True)
self.image_token = AddedToken("<image>", normalized=False, special=True)
self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True).content
self.image_token = AddedToken("<image>", normalized=False, special=True).content
tokens_to_add = {"additional_special_tokens": [self.fake_image_token, self.image_token]}
tokenizer.add_special_tokens(tokens_to_add)
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
else:
self.fake_image_token = tokenizer.image_boundary_token
self.image_token = tokenizer.image_token
self.image_token_id = tokenizer.image_token_id
self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True)
tokenizer.add_special_tokens({"additional_special_tokens": [self.end_of_utterance_token]})
@@ -190,9 +192,10 @@ class Idefics2Processor(ProcessorMixin):
)
image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None)
image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
n_images_in_text = []
inputs = BatchFeature()
inputs = {}
if text is not None:
if isinstance(text, str):
@@ -201,13 +204,14 @@ class Idefics2Processor(ProcessorMixin):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
# Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
fake_image_token = self.fake_image_token.content
image_token = self.image_token.content
fake_image_token = self.fake_image_token
image_token = self.image_token
image_str = f"{fake_image_token}{image_token * image_seq_len}{fake_image_token}"
if self.image_processor.do_image_splitting:
# A single image token is split into 4 patches + 1 original image
image_str = image_str * 5
image_seq_len *= 5
prompt_strings = []
for sample in text:
@@ -218,6 +222,7 @@ class Idefics2Processor(ProcessorMixin):
prompt_strings.append(sample)
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
inputs.update(text_inputs)
if images is not None:
@@ -259,7 +264,7 @@ class Idefics2Processor(ProcessorMixin):
image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
inputs.update(image_inputs)
return inputs
return BatchFeature(inputs, tensor_type=return_tensors)
def batch_decode(self, *args, **kwargs):
"""

View File

@@ -141,9 +141,9 @@ class Idefics3Processor(ProcessorMixin):
if tokenizer is None:
raise ValueError("You need to specify a `tokenizer`.")
self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True)
self.image_token = AddedToken("<image>", normalized=False, special=True)
self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True)
self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True).content
self.image_token = AddedToken("<image>", normalized=False, special=True).content
self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True).content
self.global_image_tag = "<global-img>" # https://github.com/huggingface/transformers/pull/32473/files/8063e5e17362571b693f1db95167f5443a3be1b2#r1734825341
self.image_seq_len = image_seq_len
@@ -159,6 +159,7 @@ class Idefics3Processor(ProcessorMixin):
]
}
tokenizer.add_special_tokens(tokens_to_add)
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
super().__init__(image_processor, tokenizer, chat_template=chat_template, **kwargs)
@@ -240,17 +241,18 @@ class Idefics3Processor(ProcessorMixin):
)
image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
n_images_in_text = []
n_images_in_images = []
inputs = BatchFeature()
inputs = {}
if text is not None:
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise ValueError("Invalid input text. Please provide a string, or a list of strings")
n_images_in_text = [sample.count(self.image_token.content) for sample in text]
n_images_in_text = [sample.count(self.image_token) for sample in text]
if images is not None:
if is_image_or_image_url(images):
@@ -259,8 +261,8 @@ class Idefics3Processor(ProcessorMixin):
if text is not None:
if sum(n_images_in_text) != len(images):
raise ValueError(
f"The total number of {self.image_token.content} tokens in the prompts should be the same as the number of images passed."
f" Found {sum(n_images_in_text)} {self.image_token.content} tokens and {len(images)} images."
f"The total number of {self.image_token} tokens in the prompts should be the same as the number of images passed."
f" Found {sum(n_images_in_text)} {self.image_token} tokens and {len(images)} images."
)
# Reorganize the images to match the prompts
cumsum_images_in_text = [0] + list(accumulate(n_images_in_text))
@@ -295,8 +297,8 @@ class Idefics3Processor(ProcessorMixin):
image_rows = inputs.pop("rows", [[0] * len(text)])
image_cols = inputs.pop("cols", [[0] * len(text)])
fake_image_token = self.fake_image_token.content
image_token = self.image_token.content
fake_image_token = self.fake_image_token
image_token = self.image_token
global_img_token = self.global_image_tag
prompt_strings = []
@@ -324,18 +326,19 @@ class Idefics3Processor(ProcessorMixin):
sample += image_prompt_string + split_sample[i + 1]
prompt_strings.append(sample)
text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"])
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
inputs.update(text_inputs)
elif text is not None:
if any(n_images_in_text):
raise ValueError(
f"Found {sum(n_images_in_text)} {self.image_token.content} tokens in the text but no images were passed."
f"Found {sum(n_images_in_text)} {self.image_token} tokens in the text but no images were passed."
)
text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
inputs.update(text_inputs)
return inputs
return BatchFeature(inputs, tensor_type=return_tensors)
def batch_decode(self, *args, **kwargs):
"""

View File

@@ -122,6 +122,7 @@ class Llama4Processor(ProcessorMixin):
self.fake_image_token = fake_image_token
self.image_token = image_token
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
self.start_of_img_token = start_of_image_token
self.end_of_img_token = end_of_image_token
self.img_patch_token = patch_token
@@ -148,6 +149,7 @@ class Llama4Processor(ProcessorMixin):
img_string += "<|tile_x_separator|>"
img_string += "<|tile_y_separator|>"
img_string += "<|image|>"
img_string += "<|patch|>" * num_patches_per_chunk
img_string += "<|image_end|>"
@@ -247,9 +249,11 @@ class Llama4Processor(ProcessorMixin):
text = processed_text
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
return BatchFeature(data={**text_inputs, **image_inputs})
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
def batch_decode(self, *args, **kwargs):
"""

View File

@@ -173,8 +173,10 @@ class LlavaProcessor(ProcessorMixin):
sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
prompt_strings.append(sample)
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
return BatchFeature(data={**text_inputs, **image_inputs})
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
def batch_decode(self, *args, **kwargs):

View File

@@ -171,9 +171,11 @@ class LlavaNextProcessor(ProcessorMixin):
prompt_strings.append(sample)
prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
return BatchFeature(data={**text_inputs, **image_inputs})
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
image_grid_pinpoints = self.image_processor.image_grid_pinpoints

View File

@@ -224,8 +224,11 @@ class LlavaNextVideoProcessor(ProcessorMixin):
prompt_strings.append(sample)
text = prompt_strings
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
# Copied from transformers.models.llava_next.processing_llava_next.LlavaNextProcessor._get_number_of_features
def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:

View File

@@ -171,7 +171,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
to_numpy_array(image_inputs["pixel_values"][0][0]),
channel_dim=output_kwargs["images_kwargs"].get("data_format"),
)
text = self._expand_image_tokens(text, image_sizes, height, width, self.image_token)
text, num_image_tokens = self._expand_image_tokens(text, image_sizes, height, width, self.image_token)
if videos is not None:
video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])
@@ -188,8 +188,11 @@ class LlavaOnevisionProcessor(ProcessorMixin):
num_video_tokens = (num_frames * pooled_height_width * pooled_height_width) + 1 # +1 for newline token
text = [sample.replace(self.video_token, self.video_token * num_video_tokens) for sample in text]
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs})
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs}, tensor_type=return_tensors)
def _expand_image_tokens(
self,
@@ -201,6 +204,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
num_frames: int = 1,
):
prompt_strings = []
max_num_vision_tokens = 0
for sample in text:
while special_token in sample:
image_size_list = next(image_sizes)
@@ -210,12 +214,13 @@ class LlavaOnevisionProcessor(ProcessorMixin):
original_size = original_size.tolist()
orig_height, orig_width = original_size
num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
max_num_vision_tokens = max(max_num_vision_tokens, num_image_tokens)
if self.vision_feature_select_strategy == "default":
num_image_tokens -= 1
sample = sample.replace(special_token, "<placeholder>" * num_image_tokens * num_frames, 1)
prompt_strings.append(sample)
text = [sample.replace("<placeholder>", special_token) for sample in prompt_strings]
return text
return text, max_num_vision_tokens
def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
image_grid_pinpoints = self.image_processor.image_grid_pinpoints

View File

@@ -274,6 +274,7 @@ class MllamaProcessor(ProcessorMixin):
)
text_kwargs = output_kwargs["text_kwargs"]
text_kwargs["return_tensors"] = None
images_kwargs = output_kwargs["images_kwargs"]
common_kwargs = output_kwargs["common_kwargs"]
@@ -287,6 +288,8 @@ class MllamaProcessor(ProcessorMixin):
text = [build_string_from_input(text_item, self.bos_token, self.image_token) for text_item in text]
_ = text_kwargs.pop("padding_side", None) # hack until padding-side is an accepted kwarg by tokenizers
encoding = self.tokenizer(text, **text_kwargs)
self._check_special_mm_tokens(text, encoding, modalities=["image"])
n_images_in_ids = [token_ids.count(self.image_token_id) for token_ids in encoding["input_ids"]]
data.update(encoding)
n_images_in_images = [0]
@@ -301,13 +304,18 @@ class MllamaProcessor(ProcessorMixin):
raise ValueError(
"If a batch of text is provided, there should be either no images or at least one image per sample"
)
if sum(n_images_in_text) > 0 and n_images_in_images != n_images_in_text:
if sum(n_images_in_text) > 0 and (
n_images_in_images != n_images_in_text or n_images_in_ids != n_images_in_images
):
if images is None:
raise ValueError("No image were provided, but there are image tokens in the prompt")
else:
add_message = ""
if sum(n_images_in_images) == sum(n_images_in_text):
if sum(n_images_in_images) == sum(n_images_in_text) and n_images_in_images != n_images_in_text:
add_message = "Make sure to pass your images as a nested list, where each sub-list holds images per batch"
elif n_images_in_ids != n_images_in_images:
add_message = "If you activated truncation with `max_length`, increase the `max_length` so image tokens aren't cropped."
raise ValueError(
f"The number of image tokens in each text ({n_images_in_text}) should be the same as the "
f"number of provided images per batch ({n_images_in_images}). {add_message}"

View File

@@ -298,23 +298,21 @@ class PaliGemmaProcessor(ProcessorMixin):
suffix = [sfx + self.tokenizer.eos_token for sfx in suffix]
pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
# max_length has to account for the image tokens
if output_kwargs["text_kwargs"].get("max_length", None) is not None:
output_kwargs["text_kwargs"]["max_length"] += self.image_seq_length
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
inputs = self.tokenizer(
input_strings,
text_pair=suffix,
return_token_type_ids=return_token_type_ids,
**output_kwargs["text_kwargs"],
)
self._check_special_mm_tokens(input_strings, inputs, modalities=["image"])
return_data = {**inputs, "pixel_values": pixel_values}
if return_token_type_ids:
labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
return_data.update({"labels": labels})
return BatchFeature(data=return_data)
return BatchFeature(data=return_data, tensor_type=return_tensors)
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Gemma
def batch_decode(self, *args, **kwargs):

View File

@@ -71,6 +71,10 @@ class Phi4MultimodalProcessor(ProcessorMixin):
tokenizer,
**kwargs,
):
self.image_token = tokenizer.image_token
self.image_token_id = tokenizer.image_token_id
self.audio_token = tokenizer.audio_token
self.audio_token_id = tokenizer.audio_token_id
super().__init__(image_processor, audio_processor, tokenizer, **kwargs)
def __call__(
@@ -113,7 +117,6 @@ class Phi4MultimodalProcessor(ProcessorMixin):
output_kwargs = self._merge_kwargs(Phi4MultimodalProcessorKwargs, self.tokenizer.init_kwargs, **kwargs)
image_kwargs = output_kwargs["images_kwargs"]
audio_kwargs = output_kwargs["audio_kwargs"]
text_kwargs = output_kwargs["text_kwargs"]
image_inputs = self.image_processor(images, **image_kwargs) if images is not None else {}
audio_inputs = self.audio_processor(audio, **audio_kwargs) if audio is not None else {}
@@ -154,7 +157,9 @@ class Phi4MultimodalProcessor(ProcessorMixin):
re.sub(re.escape(audio_token), lambda _: audio_token * next(audio_count_iter), t) for t in processed_text
]
text_inputs = self.tokenizer(processed_text, **text_kwargs)
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(processed_text, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(processed_text, text_inputs, modalities=["image"])
# prepare batch feature
data = {
@@ -163,7 +168,7 @@ class Phi4MultimodalProcessor(ProcessorMixin):
**audio_inputs,
}
return BatchFeature(data=data)
return BatchFeature(data=data, tensor_type=return_tensors)
def batch_decode(self, *args, **kwargs):
"""

View File

@@ -103,6 +103,7 @@ class PixtralProcessor(ProcessorMixin):
self.patch_size = patch_size
self.spatial_merge_size = spatial_merge_size
self.image_token = image_token
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
self.image_break_token = image_break_token
self.image_end_token = image_end_token
super().__init__(image_processor, tokenizer, chat_template=chat_template)
@@ -211,10 +212,10 @@ class PixtralProcessor(ProcessorMixin):
sample = sample.replace("<placeholder>", replace_str, 1)
prompt_strings.append(sample)
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
return BatchFeature(
data={**text_inputs, **image_inputs}, tensor_type=output_kwargs["common_kwargs"]["return_tensors"]
)
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
def batch_decode(self, *args, **kwargs):

View File

@@ -941,16 +941,14 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor):
if not isinstance(text, list):
text = [text]
text = text.copy() # below lines change text in-place
if image_grid_thw is not None:
merge_length = self.image_processor.merge_size**2
index = 0
for i in range(len(text)):
while self.image_token in text[i]:
text[i] = text[i].replace(
self.image_token,
"<|placeholder|>" * (image_grid_thw[index].prod() // merge_length),
1,
)
num_image_tokens = image_grid_thw[index].prod() // merge_length
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
index += 1
text[i] = text[i].replace("<|placeholder|>", self.image_token)
@@ -959,17 +957,16 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor):
index = 0
for i in range(len(text)):
while self.video_token in text[i]:
text[i] = text[i].replace(
self.video_token,
"<|placeholder|>" * (video_grid_thw[index].prod() // merge_length),
1,
)
num_video_tokens = video_grid_thw[index].prod() // merge_length
text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
index += 1
text[i] = text[i].replace("<|placeholder|>", self.video_token)
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
__all__ = [

View File

@@ -77,6 +77,16 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
self.image_token_id = (
tokenizer.image_token_id
if getattr(tokenizer, "image_token_id", None)
else tokenizer.convert_tokens_to_ids(self.image_token)
)
self.video_token_id = (
tokenizer.video_token_id
if getattr(tokenizer, "video_token_id", None)
else tokenizer.convert_tokens_to_ids(self.video_token)
)
super().__init__(image_processor, tokenizer, chat_template=chat_template)
def __call__(
@@ -157,16 +167,14 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
if not isinstance(text, list):
text = [text]
text = text.copy() # below lines change text in-place
if image_grid_thw is not None:
merge_length = self.image_processor.merge_size**2
index = 0
for i in range(len(text)):
while self.image_token in text[i]:
text[i] = text[i].replace(
self.image_token,
"<|placeholder|>" * (image_grid_thw[index].prod() // merge_length),
1,
)
num_image_tokens = image_grid_thw[index].prod() // merge_length
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
index += 1
text[i] = text[i].replace("<|placeholder|>", self.image_token)
@@ -175,17 +183,16 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
index = 0
for i in range(len(text)):
while self.video_token in text[i]:
text[i] = text[i].replace(
self.video_token,
"<|placeholder|>" * (video_grid_thw[index].prod() // merge_length),
1,
)
num_video_tokens = video_grid_thw[index].prod() // merge_length
text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
index += 1
text[i] = text[i].replace("<|placeholder|>", self.video_token)
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
def batch_decode(self, *args, **kwargs):
"""

View File

@@ -76,6 +76,7 @@ class Qwen2AudioProcessor(ProcessorMixin):
if chat_template is None:
chat_template = self.default_chat_template
self.audio_token = tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token
self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
self.audio_bos_token = tokenizer.audio_bos_token if hasattr(tokenizer, "audio_bos_token") else audio_bos_token
self.audio_eos_token = tokenizer.audio_eos_token if hasattr(tokenizer, "audio_eos_token") else audio_eos_token
super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
@@ -179,12 +180,14 @@ class Qwen2AudioProcessor(ProcessorMixin):
expanded_text.append(sample)
text = expanded_text
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(text, inputs, modalities=["audio"])
if audio is not None:
inputs.update(audio_inputs)
return BatchFeature(data={**inputs})
return BatchFeature(data={**inputs}, tensor_type=return_tensors)
def batch_decode(self, *args, **kwargs):
"""

View File

@@ -72,6 +72,16 @@ class Qwen2VLProcessor(ProcessorMixin):
def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
self.image_token_id = (
tokenizer.image_token_id
if getattr(tokenizer, "image_token_id", None)
else tokenizer.convert_tokens_to_ids(self.image_token)
)
self.video_token_id = (
tokenizer.video_token_id
if getattr(tokenizer, "video_token_id", None)
else tokenizer.convert_tokens_to_ids(self.video_token)
)
super().__init__(image_processor, tokenizer, chat_template=chat_template)
def __call__(
@@ -139,14 +149,15 @@ class Qwen2VLProcessor(ProcessorMixin):
if not isinstance(text, list):
text = [text]
text = text.copy() # below lines change text in-place
if image_grid_thw is not None:
merge_length = self.image_processor.merge_size**2
index = 0
for i in range(len(text)):
while self.image_token in text[i]:
text[i] = text[i].replace(
self.image_token, "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length), 1
)
num_image_tokens = image_grid_thw[index].prod() // merge_length
text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
index += 1
text[i] = text[i].replace("<|placeholder|>", self.image_token)
@@ -155,15 +166,15 @@ class Qwen2VLProcessor(ProcessorMixin):
index = 0
for i in range(len(text)):
while self.video_token in text[i]:
text[i] = text[i].replace(
self.video_token, "<|placeholder|>" * (video_grid_thw[index].prod() // merge_length), 1
)
num_video_tokens = video_grid_thw[index].prod() // merge_length
text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
index += 1
text[i] = text[i].replace("<|placeholder|>", self.video_token)
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
def batch_decode(self, *args, **kwargs):
"""

View File

@@ -149,6 +149,7 @@ class SmolVLMProcessor(ProcessorMixin):
):
self.fake_image_token = getattr(tokenizer, "fake_image_token", "<fake_token_around_image>")
self.image_token = getattr(tokenizer, "image_token", "<image>")
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
self.end_of_utterance_token = getattr(tokenizer, "end_of_utterance_token", "<end_of_utterance>")
self.global_image_token = getattr(tokenizer, "global_image_token", "<global-img>")
self.image_seq_len = image_seq_len
@@ -290,7 +291,7 @@ class SmolVLMProcessor(ProcessorMixin):
if n_images_in_text > 0 and (images is None and videos is None):
raise ValueError(f"We detected {n_images_in_text} tokens in the text but no images/videos were passed")
inputs = BatchFeature()
inputs = {}
# Images and videos are mutually exclusive, so process one which is present
if images is not None:
images = make_nested_list_of_images(images)
@@ -313,11 +314,14 @@ class SmolVLMProcessor(ProcessorMixin):
)
inputs.update(vision_inputs)
return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
if text is not None:
text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
inputs.update(text_inputs)
return inputs
return BatchFeature(inputs, tensor_type=return_tensors)
def _process_messages_for_chat_template(
self,

View File

@@ -87,6 +87,8 @@ class VideoLlavaProcessor(ProcessorMixin):
self.vision_feature_select_strategy = vision_feature_select_strategy
self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
self.video_token_id = tokenizer.convert_tokens_to_ids(self.video_token)
super().__init__(image_processor, tokenizer, chat_template=chat_template)
def __call__(
@@ -195,14 +197,16 @@ class VideoLlavaProcessor(ProcessorMixin):
text_inputs = self.tokenizer(
prompt_strings,
return_tensors=return_tensors,
return_tensors=None,
padding=padding,
truncation=truncation,
max_length=max_length,
)
self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image", "video"])
data.update(text_inputs)
return BatchFeature(data=data)
return BatchFeature(data=data, tensor_type=return_tensors)
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
def batch_decode(self, *args, **kwargs):

View File

@@ -31,6 +31,7 @@ from huggingface_hub.errors import EntryNotFoundError
from .audio_utils import load_audio
from .dynamic_module_utils import custom_object_save
from .feature_extraction_utils import BatchFeature
from .image_utils import (
ChannelDimension,
ImageInput,
@@ -1615,6 +1616,23 @@ class ProcessorMixin(PushToHubMixin):
"""
return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs)
def _check_special_mm_tokens(self, text: list[str], text_inputs: "BatchFeature", modalities: list[str]):
"""
Checks that number of special tokens in text and processed text is same. The count can be different
if tokenized text was truncated, leading to issues in model code.
"""
for modality in modalities:
token_str = getattr(self, f"{modality}_token")
token_id = getattr(self, f"{modality}_token_id")
ids_count = [list(ids).count(token_id) for ids in text_inputs["input_ids"]]
text_count = [sample.count(token_str) for sample in text]
if ids_count != text_count:
raise ValueError(
f"Mismatch in `{modality}` token count between text and `input_ids`. Got ids={ids_count} and text={text_count}. "
"Likely due to `truncation='max_length'`. Please disable truncation or increase `max_length`."
)
def _validate_images_text_input_order(images, text):
"""

View File

@@ -271,3 +271,29 @@ And who is that?<|im_end|>
return_tensors="np",
)
self.assertListEqual(list(out_dict[self.images_input_name].shape), [1, 3, 980, 980])
def test_special_mm_token_truncation(self):
    """Checks that truncating away special vision tokens is reported with a `ValueError`."""
    processor = self.get_processor()

    texts = self.prepare_text_inputs(batch_size=2, modality="image")
    images = self.prepare_image_inputs(batch_size=2)
    shared_kwargs = {"text": texts, "images": images, "return_tensors": "pt", "padding": True}

    # Without truncation the call has to go through; any exception fails the test.
    processor(truncation=None, **shared_kwargs)

    # With truncation and a very small `max_length` the processor is expected to raise.
    with self.assertRaises(ValueError):
        processor(truncation=True, max_length=3, **shared_kwargs)

View File

@@ -40,10 +40,37 @@ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase):
tokenizer = LlamaTokenizer(vocab_file=SAMPLE_VOCAB)
tokenizer.pad_token_id = 0
tokenizer.sep_token_id = 1
tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer, image_seq_length=2)
processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.image_token
def test_special_mm_token_truncation(self):
    """Checks that truncating away special vision tokens is reported with a `ValueError`."""
    processor = self.get_processor()

    texts = self.prepare_text_inputs(batch_size=2, modality="image")
    images = self.prepare_image_inputs(batch_size=2)
    shared_kwargs = {"text": texts, "images": images, "return_tensors": "pt", "padding": True}

    # Without truncation the call has to go through; any exception fails the test.
    processor(truncation=None, **shared_kwargs)

    # With truncation and `max_length=20` the processor is expected to raise.
    with self.assertRaises(ValueError):
        processor(truncation=True, max_length=20, **shared_kwargs)
@staticmethod
def prepare_processor_dict():
    """Return the processor keyword overrides: `image_seq_length` fixed at 2."""
    processor_kwargs = {"image_seq_length": 2}
    return processor_kwargs

View File

@@ -124,3 +124,28 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
# base image + 4 crops
self.assertEqual(len(inputs[self.images_input_name]), 5)
self.assertEqual(len(inputs[self.text_input_name][0]), 67)
def test_special_mm_token_truncation(self):
    """Checks that truncating away special vision tokens is reported with a `ValueError`."""
    processor = self.get_processor()

    texts = self.prepare_text_inputs(batch_size=2, modality="image")
    images = self.prepare_image_inputs(batch_size=2)
    shared_kwargs = {"text": texts, "images": images, "return_tensors": "pt", "padding": True}

    # Without truncation the call has to go through; any exception fails the test.
    processor(truncation=None, **shared_kwargs)

    # With truncation and a very small `max_length` the processor is expected to raise.
    with self.assertRaises(ValueError):
        processor(truncation=True, max_length=5, **shared_kwargs)

View File

@@ -66,8 +66,8 @@ class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
)
)
cls.bos_token = processor.tokenizer.bos_token
cls.image_token = processor.image_token.content
cls.fake_image_token = processor.fake_image_token.content
cls.image_token = processor.image_token
cls.fake_image_token = processor.fake_image_token
cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token)
cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token)

View File

@@ -60,8 +60,8 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
)
)
cls.bos_token = processor.tokenizer.bos_token
cls.image_token = processor.image_token.content
cls.fake_image_token = processor.fake_image_token.content
cls.image_token = processor.image_token
cls.fake_image_token = processor.fake_image_token
cls.global_img_token = processor.global_image_tag
cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token)

View File

@@ -40,6 +40,7 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
image_processor = CLIPImageProcessor(do_center_crop=False)
tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
processor_kwargs = cls.prepare_processor_dict()
processor = LlavaProcessor(image_processor, tokenizer, **processor_kwargs)
processor.save_pretrained(cls.tmpdirname)
@@ -79,3 +80,29 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor = LlavaProcessor.from_pretrained(checkpoint)
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__)
def test_special_mm_token_truncation(self):
    """Checks that truncating away special vision tokens is reported with a `ValueError`."""
    # This test loads the published checkpoint instead of using `self.get_processor()`.
    processor = LlavaProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")

    texts = self.prepare_text_inputs(batch_size=2, modality="image")
    images = self.prepare_image_inputs(batch_size=2)
    shared_kwargs = {"text": texts, "images": images, "return_tensors": "pt", "padding": True}

    # Without truncation the call has to go through; any exception fails the test.
    processor(truncation=None, **shared_kwargs)

    # With truncation and a very small `max_length` the processor is expected to raise.
    with self.assertRaises(ValueError):
        processor(truncation=True, max_length=5, **shared_kwargs)

View File

@@ -40,6 +40,7 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
image_processor = LlavaNextImageProcessor()
tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
processor_kwargs = cls.prepare_processor_dict()
processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs)
processor.save_pretrained(cls.tmpdirname)

View File

@@ -41,6 +41,7 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
image_processor = LlavaNextImageProcessor()
video_processor = LlavaNextVideoImageProcessor()
tokenizer = LlamaTokenizerFast.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
tokenizer.add_special_tokens({"additional_special_tokens": ["<image>", "<video>"]})
processor_kwargs = cls.prepare_processor_dict()
processor = LlavaNextVideoProcessor(

View File

@@ -45,6 +45,7 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
image_processor = LlavaOnevisionImageProcessor()
video_processor = LlavaOnevisionVideoProcessor()
tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
tokenizer.add_special_tokens({"additional_special_tokens": ["<image>", "<video>"]})
processor_kwargs = cls.prepare_processor_dict()
processor = LlavaOnevisionProcessor(

View File

@@ -290,3 +290,29 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
inputs_image = processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
self.assertIn("input_ids", inputs_image)
self.assertTrue(len(inputs_image["input_ids"]) == 5)
def test_special_mm_token_truncation(self):
    """Checks that truncating away special vision tokens is reported with a `ValueError`."""
    processor = self.get_processor()

    texts = self.prepare_text_inputs(batch_size=2, modality="image")
    images = self.prepare_image_inputs(batch_size=2)
    shared_kwargs = {"text": texts, "images": images, "return_tensors": "pt", "padding": True}

    # Without truncation the call has to go through; any exception fails the test.
    processor(truncation=None, **shared_kwargs)

    # With truncation and a very small `max_length` the processor is expected to raise.
    with self.assertRaises(ValueError):
        processor(truncation=True, max_length=3, **shared_kwargs)

View File

@@ -360,3 +360,29 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1])
and len(inputs[self.text_input_name][1]) < 76
)
def test_special_mm_token_truncation(self):
    """Checks that truncating away special vision tokens is reported with a `ValueError`."""
    processor = self.get_processor()

    texts = self.prepare_text_inputs(batch_size=2, modality="image")
    flat_images = self.prepare_image_inputs(batch_size=2)
    # Re-shape into one single-image list per text sample.
    images = [[flat_images[idx]] for idx in (0, 1)]
    shared_kwargs = {"text": texts, "images": images, "return_tensors": "pt", "padding": True}

    # Without truncation the call has to go through; any exception fails the test.
    processor(truncation=None, **shared_kwargs)

    # With truncation and a very small `max_length` the processor is expected to raise.
    with self.assertRaises(ValueError):
        processor(truncation=True, max_length=3, **shared_kwargs)

View File

@@ -39,6 +39,7 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
image_processor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
image_processor.image_seq_length = 0 # TODO: raushan fix me in #37342
tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)
tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer)
processor.save_pretrained(cls.tmpdirname)
cls.image_token = processor.image_token
@@ -59,7 +60,7 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
inputs = processor(
text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
)
self.assertEqual(len(inputs["input_ids"][0]), 112 + 14)
self.assertEqual(len(inputs["input_ids"][0]), 112)
def test_text_with_image_tokens(self):
image_processor = self.get_component("image_processor")

View File

@@ -397,3 +397,29 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
self.assertEqual(inputs[self.images_input_name].shape[0], 100)
inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
self.assertEqual(inputs[self.images_input_name].shape[0], 612)
def test_special_mm_token_truncation(self):
    """Checks that truncating away special vision tokens is reported with a `ValueError`."""
    processor = self.get_processor()

    texts = self.prepare_text_inputs(batch_size=2, modality="image")
    images = self.prepare_image_inputs(batch_size=2)
    shared_kwargs = {"text": texts, "images": images, "return_tensors": "pt", "padding": True}

    # Without truncation the call has to go through; any exception fails the test.
    processor(truncation=None, **shared_kwargs)

    # With truncation and `max_length=20` the processor is expected to raise.
    with self.assertRaises(ValueError):
        processor(truncation=True, max_length=20, **shared_kwargs)

View File

@@ -435,7 +435,8 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
processor_kwargs = self.prepare_processor_dict()
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs)
self.skip_processor_without_typed_kwargs(processor)
input_str = self.prepare_text_inputs(batch_size=2, modality="image")
@@ -445,14 +446,14 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
text=input_str,
images=image_input,
return_tensors="pt",
padding="longest",
padding="max_length",
max_length=76,
truncation=True,
max_image_size={"longest_edge": 30},
max_image_size={"longest_edge": 300},
)
self.assertEqual(inputs["pixel_values"].shape[2], 3)
self.assertEqual(inputs["pixel_values"].shape[3], 30)
self.assertEqual(inputs["pixel_values"].shape[3], 300)
self.assertEqual(len(inputs["input_ids"][0]), 76)
@require_torch
@@ -529,3 +530,29 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
with self.assertRaises(ValueError) as context:
processor(text=texts, images=None)
self.assertTrue("tokens in the text but no images/videos were passed" in str(context.exception))
def test_special_mm_token_truncation(self):
    """Checks that truncating away special vision tokens is reported with a `ValueError`."""
    processor = self.get_processor()

    texts = self.prepare_text_inputs(batch_size=2, modality="image")
    flat_images = self.prepare_image_inputs(batch_size=2)
    # Re-shape into one single-image list per text sample.
    images = [[flat_images[idx]] for idx in (0, 1)]
    shared_kwargs = {"text": texts, "images": images, "return_tensors": "pt", "padding": True}

    # Without truncation the call has to go through; any exception fails the test.
    processor(truncation=None, **shared_kwargs)

    # With truncation and `max_length=20` the processor is expected to raise.
    with self.assertRaises(ValueError):
        processor(truncation=True, max_length=20, **shared_kwargs)