From 32eca7197a8d2618417a0d665db38d0af3695a2c Mon Sep 17 00:00:00 2001 From: Raushan Turganbay Date: Wed, 16 Apr 2025 20:49:20 +0200 Subject: [PATCH] [vlm] adjust max length for special tokens (#37342) * update * apply suggestion * fix tests for main branch * remove unused logger * add special tokens in tests * nit * fix more tests * fix test * pg also --- src/transformers/models/aria/modular_aria.py | 13 ++++--- .../models/aria/processing_aria.py | 13 ++++--- .../aya_vision/processing_aya_vision.py | 5 ++- .../models/chameleon/processing_chameleon.py | 5 ++- .../models/emu3/processing_emu3.py | 6 +++- .../models/gemma3/processing_gemma3.py | 3 +- .../models/got_ocr2/processing_got_ocr2.py | 7 +++- .../models/idefics2/processing_idefics2.py | 17 +++++---- .../models/idefics3/processing_idefics3.py | 27 +++++++------- .../models/llama4/processing_llama4.py | 6 +++- .../models/llava/processing_llava.py | 4 ++- .../llava_next/processing_llava_next.py | 4 ++- .../processing_llava_next_video.py | 5 ++- .../processing_llava_onevision.py | 11 ++++-- .../models/mllama/processing_mllama.py | 12 +++++-- .../models/paligemma/processing_paligemma.py | 8 ++--- .../processing_phi4_multimodal.py | 11 ++++-- .../models/pixtral/processing_pixtral.py | 7 ++-- .../models/qwen2_5_vl/modular_qwen2_5_vl.py | 19 +++++----- .../qwen2_5_vl/processing_qwen2_5_vl.py | 29 +++++++++------ .../qwen2_audio/processing_qwen2_audio.py | 5 ++- .../models/qwen2_vl/processing_qwen2_vl.py | 27 +++++++++----- .../models/smolvlm/processing_smolvlm.py | 10 ++++-- .../video_llava/processing_video_llava.py | 8 +++-- src/transformers/processing_utils.py | 18 ++++++++++ tests/models/aria/test_processor_aria.py | 26 ++++++++++++++ .../chameleon/test_processor_chameleon.py | 27 ++++++++++++++ tests/models/gemma3/test_processing_gemma3.py | 25 +++++++++++++ .../idefics2/test_processor_idefics2.py | 4 +-- .../idefics3/test_processor_idefics3.py | 4 +-- tests/models/llava/test_processor_llava.py | 27 ++++++++++++++ .../llava_next/test_processor_llava_next.py | 1 + .../test_processor_llava_next_video.py | 1 + .../test_processor_llava_onevision.py | 1 + .../mistral3/test_processor_mistral3.py | 26 ++++++++++++++ tests/models/mllama/test_processor_mllama.py | 26 ++++++++++++++ .../paligemma/test_processor_paligemma.py | 3 +- .../qwen2_vl/test_processor_qwen2_vl.py | 26 ++++++++++++++ .../models/smolvlm/test_processor_smolvlm.py | 35 ++++++++++++++++--- 39 files changed, 414 insertions(+), 98 deletions(-) diff --git a/src/transformers/models/aria/modular_aria.py b/src/transformers/models/aria/modular_aria.py index d331789354..b087c21536 100644 --- a/src/transformers/models/aria/modular_aria.py +++ b/src/transformers/models/aria/modular_aria.py @@ -946,6 +946,8 @@ class AriaProcessor(ProcessorMixin): size_conversion = {490: 128, 980: 256} self.size_conversion = {int(k): v for k, v in size_conversion.items()} + self.image_token = tokenizer.image_token + self.image_token_id = tokenizer.image_token_id if tokenizer is not None and tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.unk_token @@ -986,10 +988,12 @@ class AriaProcessor(ProcessorMixin): tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) + if isinstance(text, str): text = [text] elif not isinstance(text, list) and not isinstance(text[0], str): raise ValueError("Invalid input text. Please provide a string, or a list of strings") + if images is not None: image_inputs = self.image_processor( images, @@ -1007,12 +1011,11 @@ class AriaProcessor(ProcessorMixin): image_inputs = {} prompt_strings = text - text_inputs = self.tokenizer( - prompt_strings, - **output_kwargs["text_kwargs"], - ) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) - return BatchFeature(data={**text_inputs, **image_inputs}) + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/aria/processing_aria.py b/src/transformers/models/aria/processing_aria.py index 4b7163db8f..7d30762475 100644 --- a/src/transformers/models/aria/processing_aria.py +++ b/src/transformers/models/aria/processing_aria.py @@ -72,6 +72,8 @@ class AriaProcessor(ProcessorMixin): size_conversion = {490: 128, 980: 256} self.size_conversion = {int(k): v for k, v in size_conversion.items()} + self.image_token = tokenizer.image_token + self.image_token_id = tokenizer.image_token_id if tokenizer is not None and tokenizer.pad_token is None: tokenizer.pad_token = tokenizer.unk_token @@ -112,10 +114,12 @@ class AriaProcessor(ProcessorMixin): tokenizer_init_kwargs=self.tokenizer.init_kwargs, **kwargs, ) + if isinstance(text, str): text = [text] elif not isinstance(text, list) and not isinstance(text[0], str): raise ValueError("Invalid input text. Please provide a string, or a list of strings") + if images is not None: image_inputs = self.image_processor( images, @@ -133,12 +137,11 @@ class AriaProcessor(ProcessorMixin): image_inputs = {} prompt_strings = text - text_inputs = self.tokenizer( - prompt_strings, - **output_kwargs["text_kwargs"], - ) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) - return BatchFeature(data={**text_inputs, **image_inputs}) + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/aya_vision/processing_aya_vision.py b/src/transformers/models/aya_vision/processing_aya_vision.py index 20837000a8..3b9afecda5 100644 --- a/src/transformers/models/aya_vision/processing_aya_vision.py +++ b/src/transformers/models/aya_vision/processing_aya_vision.py @@ -121,6 +121,7 @@ class AyaVisionProcessor(ProcessorMixin): super().__init__(image_processor, tokenizer, chat_template=chat_template) self.image_token = image_token + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) self.patch_size = patch_size * downsample_factor self.img_size = img_size @@ -224,9 +225,11 @@ class AyaVisionProcessor(ProcessorMixin): text = processed_text + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(text, text_inputs, modalities=["image"]) - return BatchFeature(data={**text_inputs, **image_inputs}) + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/chameleon/processing_chameleon.py b/src/transformers/models/chameleon/processing_chameleon.py index 5c80c7c6c4..f0c592180e 100644 --- a/src/transformers/models/chameleon/processing_chameleon.py +++ b/src/transformers/models/chameleon/processing_chameleon.py @@ -68,6 +68,7 @@ class ChameleonProcessor(ProcessorMixin): def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = ""): self.image_seq_length = image_seq_length self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) self.image_start_token = ( tokenizer.boi_token if hasattr(tokenizer, "boi_token") else "" ) # fixed tokens for start and end, so can hardcode @@ -140,12 +141,14 @@ class ChameleonProcessor(ProcessorMixin): sample += self.tokenizer.sep_token # special Chameleon treatment to add sep for chat mode prompt_strings.append(sample) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(prompt_strings, data, modalities=["image"]) if images is not None: data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] - return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"]) + return BatchFeature(data=data, tensor_type=return_tensors) # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/emu3/processing_emu3.py b/src/transformers/models/emu3/processing_emu3.py index ff0e681623..a94dc08cd9 100644 --- a/src/transformers/models/emu3/processing_emu3.py +++ b/src/transformers/models/emu3/processing_emu3.py @@ -75,6 +75,7 @@ class Emu3Processor(ProcessorMixin): **kwargs, ): self.image_token = tokenizer.image_token # image_token as placeholder to be replaced by vq-vae tokens + self.image_token_id = tokenizer.image_token_id self.image_start_token = tokenizer.boi_token # "<|image start|>" fixed tokens for start and end of image self.image_end_token = tokenizer.eoi_token # "<|image end|>" self.fake_token_around_image = tokenizer.image_wrapper_token # "<|image token|>" every image starts with it @@ -177,10 +178,13 @@ class Emu3Processor(ProcessorMixin): image_features["image_sizes"] = [[height, width]] * len(text) # else just generate from text-only input, and we do no special treatment for text + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) data = self.tokenizer(text, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(text, data, modalities=["image"]) + data.update(**image_features) - return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].pop("return_tensors", None)) + return BatchFeature(data=data, tensor_type=return_tensors) def calculate_generate_size(self, ratio, image_area, spatial_factor): width, height = map(int, ratio.split(":")) diff --git a/src/transformers/models/gemma3/processing_gemma3.py b/src/transformers/models/gemma3/processing_gemma3.py index bfdf400212..f887e11d5c 100644 --- a/src/transformers/models/gemma3/processing_gemma3.py +++ b/src/transformers/models/gemma3/processing_gemma3.py @@ -65,7 +65,7 @@ class Gemma3Processor(ProcessorMixin): self.image_seq_length = image_seq_length self.image_token_id = tokenizer.image_token_id self.boi_token = tokenizer.boi_token - self.image_token = tokenizer.boi_token + self.image_token = tokenizer.image_token image_tokens_expanded = "".join([tokenizer.image_token] * image_seq_length) self.full_image_sequence = f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n" @@ -138,6 +138,7 @@ class Gemma3Processor(ProcessorMixin): return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"], return_tensors="np") + self._check_special_mm_tokens(text, text_inputs, modalities=["image"]) # Add token type ids manually, as tokenizer can't do arbitrary position token types array_ids = text_inputs["input_ids"] diff --git a/src/transformers/models/got_ocr2/processing_got_ocr2.py b/src/transformers/models/got_ocr2/processing_got_ocr2.py index 398ec36c9e..5e40d14dee 100644 --- a/src/transformers/models/got_ocr2/processing_got_ocr2.py +++ b/src/transformers/models/got_ocr2/processing_got_ocr2.py @@ -107,6 +107,8 @@ class GotOcr2Processor(ProcessorMixin): self.img_start_token = "" self.img_end_token = "" self.img_pad_token = "" + self.image_token = "" # keep the above for BC, but we need to call it `image_token` + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail." def _make_list_of_inputs(self, images, text, box, color, multi_page): @@ -250,8 +252,11 @@ class GotOcr2Processor(ProcessorMixin): ) text.append(prompt) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) - return BatchFeature(data={**text_inputs, **image_inputs}) + self._check_special_mm_tokens(text, text_inputs, modalities=["image"]) + + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/idefics2/processing_idefics2.py b/src/transformers/models/idefics2/processing_idefics2.py index c69945e282..ab144f3f9d 100644 --- a/src/transformers/models/idefics2/processing_idefics2.py +++ b/src/transformers/models/idefics2/processing_idefics2.py @@ -98,13 +98,15 @@ class Idefics2Processor(ProcessorMixin): raise ValueError("You need to specify a `tokenizer`.") if not hasattr(tokenizer, "image_token"): - self.fake_image_token = AddedToken("", normalized=False, special=True) - self.image_token = AddedToken("", normalized=False, special=True) + self.fake_image_token = AddedToken("", normalized=False, special=True).content + self.image_token = AddedToken("", normalized=False, special=True).content tokens_to_add = {"additional_special_tokens": [self.fake_image_token, self.image_token]} tokenizer.add_special_tokens(tokens_to_add) + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) else: self.fake_image_token = tokenizer.image_boundary_token self.image_token = tokenizer.image_token + self.image_token_id = tokenizer.image_token_id self.end_of_utterance_token = AddedToken("", normalized=False, special=True) tokenizer.add_special_tokens({"additional_special_tokens": [self.end_of_utterance_token]}) @@ -190,9 +192,10 @@ class Idefics2Processor(ProcessorMixin): ) image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None) image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) n_images_in_text = [] - inputs = BatchFeature() + inputs = {} if text is not None: if isinstance(text, str): @@ -201,13 +204,14 @@ class Idefics2Processor(ProcessorMixin): raise ValueError("Invalid input text. Please provide a string, or a list of strings") # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len` - fake_image_token = self.fake_image_token.content - image_token = self.image_token.content + fake_image_token = self.fake_image_token + image_token = self.image_token image_str = f"{fake_image_token}{image_token * image_seq_len}{fake_image_token}" if self.image_processor.do_image_splitting: # A single image token is split into 4 patches + 1 original image image_str = image_str * 5 + image_seq_len *= 5 prompt_strings = [] for sample in text: @@ -218,6 +222,7 @@ class Idefics2Processor(ProcessorMixin): prompt_strings.append(sample) text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) inputs.update(text_inputs) if images is not None: @@ -259,7 +264,7 @@ class Idefics2Processor(ProcessorMixin): image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"]) inputs.update(image_inputs) - return inputs + return BatchFeature(inputs, tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/idefics3/processing_idefics3.py b/src/transformers/models/idefics3/processing_idefics3.py index 0f1cf7b248..1fcce0a453 100644 --- a/src/transformers/models/idefics3/processing_idefics3.py +++ b/src/transformers/models/idefics3/processing_idefics3.py @@ -141,9 +141,9 @@ class Idefics3Processor(ProcessorMixin): if tokenizer is None: raise ValueError("You need to specify a `tokenizer`.") - self.fake_image_token = AddedToken("", normalized=False, special=True) - self.image_token = AddedToken("", normalized=False, special=True) - self.end_of_utterance_token = AddedToken("", normalized=False, special=True) + self.fake_image_token = AddedToken("", normalized=False, special=True).content + self.image_token = AddedToken("", normalized=False, special=True).content + self.end_of_utterance_token = AddedToken("", normalized=False, special=True).content self.global_image_tag = "" # https://github.com/huggingface/transformers/pull/32473/files/8063e5e17362571b693f1db95167f5443a3be1b2#r1734825341 self.image_seq_len = image_seq_len @@ -159,6 +159,7 @@ class Idefics3Processor(ProcessorMixin): ] } tokenizer.add_special_tokens(tokens_to_add) + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) super().__init__(image_processor, tokenizer, chat_template=chat_template, **kwargs) @@ -240,17 +241,18 @@ class Idefics3Processor(ProcessorMixin): ) image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) n_images_in_text = [] n_images_in_images = [] - inputs = BatchFeature() + inputs = {} if text is not None: if isinstance(text, str): text = [text] elif not isinstance(text, list) and not isinstance(text[0], str): raise ValueError("Invalid input text. Please provide a string, or a list of strings") - n_images_in_text = [sample.count(self.image_token.content) for sample in text] + n_images_in_text = [sample.count(self.image_token) for sample in text] if images is not None: if is_image_or_image_url(images): @@ -259,8 +261,8 @@ class Idefics3Processor(ProcessorMixin): if text is not None: if sum(n_images_in_text) != len(images): raise ValueError( - f"The total number of {self.image_token.content} tokens in the prompts should be the same as the number of images passed." - f" Found {sum(n_images_in_text)} {self.image_token.content} tokens and {len(images)} images." + f"The total number of {self.image_token} tokens in the prompts should be the same as the number of images passed." + f" Found {sum(n_images_in_text)} {self.image_token} tokens and {len(images)} images." ) # Reorganize the images to match the prompts cumsum_images_in_text = [0] + list(accumulate(n_images_in_text)) @@ -295,8 +297,8 @@ class Idefics3Processor(ProcessorMixin): image_rows = inputs.pop("rows", [[0] * len(text)]) image_cols = inputs.pop("cols", [[0] * len(text)]) - fake_image_token = self.fake_image_token.content - image_token = self.image_token.content + fake_image_token = self.fake_image_token + image_token = self.image_token global_img_token = self.global_image_tag prompt_strings = [] @@ -324,18 +326,19 @@ class Idefics3Processor(ProcessorMixin): sample += image_prompt_string + split_sample[i + 1] prompt_strings.append(sample) - text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"]) + text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) inputs.update(text_inputs) elif text is not None: if any(n_images_in_text): raise ValueError( - f"Found {sum(n_images_in_text)} {self.image_token.content} tokens in the text but no images were passed." + f"Found {sum(n_images_in_text)} {self.image_token} tokens in the text but no images were passed." ) text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) inputs.update(text_inputs) - return inputs + return BatchFeature(inputs, tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/llama4/processing_llama4.py b/src/transformers/models/llama4/processing_llama4.py index 0ca4a44c5e..7ca562571c 100644 --- a/src/transformers/models/llama4/processing_llama4.py +++ b/src/transformers/models/llama4/processing_llama4.py @@ -122,6 +122,7 @@ class Llama4Processor(ProcessorMixin): self.fake_image_token = fake_image_token self.image_token = image_token + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) self.start_of_img_token = start_of_image_token self.end_of_img_token = end_of_image_token self.img_patch_token = patch_token @@ -148,6 +149,7 @@ class Llama4Processor(ProcessorMixin): img_string += "<|tile_x_separator|>" img_string += "<|tile_y_separator|>" + img_string += "<|image|>" img_string += "<|patch|>" * num_patches_per_chunk img_string += "<|image_end|>" @@ -247,9 +249,11 @@ class Llama4Processor(ProcessorMixin): text = processed_text + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(text, text_inputs, modalities=["image"]) - return BatchFeature(data={**text_inputs, **image_inputs}) + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/llava/processing_llava.py b/src/transformers/models/llava/processing_llava.py index 6253e1992f..72a61bff71 100644 --- a/src/transformers/models/llava/processing_llava.py +++ b/src/transformers/models/llava/processing_llava.py @@ -173,8 +173,10 @@ class LlavaProcessor(ProcessorMixin): sample = sample.replace(self.image_token, self.image_token * num_image_tokens) prompt_strings.append(sample) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) - return BatchFeature(data={**text_inputs, **image_inputs}) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/llava_next/processing_llava_next.py b/src/transformers/models/llava_next/processing_llava_next.py index 61bb2cdd7f..78175adc21 100644 --- a/src/transformers/models/llava_next/processing_llava_next.py +++ b/src/transformers/models/llava_next/processing_llava_next.py @@ -171,9 +171,11 @@ class LlavaNextProcessor(ProcessorMixin): prompt_strings.append(sample) prompt_strings = [sample.replace("", self.image_token) for sample in prompt_strings] + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) - return BatchFeature(data={**text_inputs, **image_inputs}) + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int: image_grid_pinpoints = self.image_processor.image_grid_pinpoints diff --git a/src/transformers/models/llava_next_video/processing_llava_next_video.py b/src/transformers/models/llava_next_video/processing_llava_next_video.py index 43b4102b96..c7ff0a1d7a 100644 --- a/src/transformers/models/llava_next_video/processing_llava_next_video.py +++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py @@ -224,8 +224,11 @@ class LlavaNextVideoProcessor(ProcessorMixin): prompt_strings.append(sample) text = prompt_strings + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) - return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}) + self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) + + return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors) # Copied from transformers.models.llava_next.processing_llava_next.LlavaNextProcessor._get_number_of_features def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int: diff --git a/src/transformers/models/llava_onevision/processing_llava_onevision.py b/src/transformers/models/llava_onevision/processing_llava_onevision.py index cb39f09e52..51d8dcf9c0 100644 --- a/src/transformers/models/llava_onevision/processing_llava_onevision.py +++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py @@ -171,7 +171,7 @@ class LlavaOnevisionProcessor(ProcessorMixin): to_numpy_array(image_inputs["pixel_values"][0][0]), channel_dim=output_kwargs["images_kwargs"].get("data_format"), ) - text = self._expand_image_tokens(text, image_sizes, height, width, self.image_token) + text, num_image_tokens = self._expand_image_tokens(text, image_sizes, height, width, self.image_token) if videos is not None: video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"]) @@ -188,8 +188,11 @@ class LlavaOnevisionProcessor(ProcessorMixin): num_video_tokens = (num_frames * pooled_height_width * pooled_height_width) + 1 # +1 for newline token text = [sample.replace(self.video_token, self.video_token * num_video_tokens) for sample in text] + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) - return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs}) + self._check_special_mm_tokens(text, text_inputs, modalities=["image"]) + + return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs}, tensor_type=return_tensors) def _expand_image_tokens( self, @@ -201,6 +204,7 @@ class LlavaOnevisionProcessor(ProcessorMixin): num_frames: int = 1, ): prompt_strings = [] + max_num_vision_tokens = 0 for sample in text: while special_token in sample: image_size_list = next(image_sizes) @@ -210,12 +214,13 @@ class LlavaOnevisionProcessor(ProcessorMixin): original_size = original_size.tolist() orig_height, orig_width = original_size num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width) + max_num_vision_tokens = max(max_num_vision_tokens, num_image_tokens) if self.vision_feature_select_strategy == "default": num_image_tokens -= 1 sample = sample.replace(special_token, "" * num_image_tokens * num_frames, 1) prompt_strings.append(sample) text = [sample.replace("", special_token) for sample in prompt_strings] - return text + return text, max_num_vision_tokens def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int: image_grid_pinpoints = self.image_processor.image_grid_pinpoints diff --git a/src/transformers/models/mllama/processing_mllama.py b/src/transformers/models/mllama/processing_mllama.py index d26d93bc3c..ad2ff2ddb8 100644 --- a/src/transformers/models/mllama/processing_mllama.py +++ b/src/transformers/models/mllama/processing_mllama.py @@ -274,6 +274,7 @@ class MllamaProcessor(ProcessorMixin): ) text_kwargs = output_kwargs["text_kwargs"] + text_kwargs["return_tensors"] = None images_kwargs = output_kwargs["images_kwargs"] common_kwargs = output_kwargs["common_kwargs"] @@ -287,6 +288,8 @@ class MllamaProcessor(ProcessorMixin): text = [build_string_from_input(text_item, self.bos_token, self.image_token) for text_item in text] _ = text_kwargs.pop("padding_side", None) # hack until padding-side is an accepted kwarg by tokenizers encoding = self.tokenizer(text, **text_kwargs) + self._check_special_mm_tokens(text, encoding, modalities=["image"]) + n_images_in_ids = [token_ids.count(self.image_token_id) for token_ids in encoding["input_ids"]] data.update(encoding) n_images_in_images = [0] @@ -301,13 +304,18 @@ class MllamaProcessor(ProcessorMixin): raise ValueError( "If a batch of text is provided, there should be either no images or at least one image per sample" ) - if sum(n_images_in_text) > 0 and n_images_in_images != n_images_in_text: + if sum(n_images_in_text) > 0 and ( + n_images_in_images != n_images_in_text or n_images_in_ids != n_images_in_images + ): if images is None: raise ValueError("No image were provided, but there are image tokens in the prompt") else: add_message = "" - if sum(n_images_in_images) == sum(n_images_in_text): + if sum(n_images_in_images) == sum(n_images_in_text) and n_images_in_images != n_images_in_text: add_message = "Make sure to pass your images as a nested list, where each sub-list holds images per batch" + elif n_images_in_ids != n_images_in_images: + add_message = "If you activated truncation with `max_length`, increase the `max_length` so image tokens aren't cropped." + raise ValueError( f"The number of image tokens in each text ({n_images_in_text}) should be the same as the " f"number of provided images per batch ({n_images_in_images}). {add_message}" diff --git a/src/transformers/models/paligemma/processing_paligemma.py b/src/transformers/models/paligemma/processing_paligemma.py index f389487c2b..5048f0c3ee 100644 --- a/src/transformers/models/paligemma/processing_paligemma.py +++ b/src/transformers/models/paligemma/processing_paligemma.py @@ -298,23 +298,21 @@ class PaliGemmaProcessor(ProcessorMixin): suffix = [sfx + self.tokenizer.eos_token for sfx in suffix] pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"] - # max_length has to account for the image tokens - if output_kwargs["text_kwargs"].get("max_length", None) is not None: - output_kwargs["text_kwargs"]["max_length"] += self.image_seq_length - + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) inputs = self.tokenizer( input_strings, text_pair=suffix, return_token_type_ids=return_token_type_ids, **output_kwargs["text_kwargs"], ) + self._check_special_mm_tokens(input_strings, inputs, modalities=["image"]) return_data = {**inputs, "pixel_values": pixel_values} if return_token_type_ids: labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100) return_data.update({"labels": labels}) - return BatchFeature(data=return_data) + return BatchFeature(data=return_data, tensor_type=return_tensors) # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Gemma def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py b/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py index f853865f7d..a0d5a75a59 100644 --- a/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py +++ b/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py @@ -71,6 +71,10 @@ class Phi4MultimodalProcessor(ProcessorMixin): tokenizer, **kwargs, ): + self.image_token = tokenizer.image_token + self.image_token_id = tokenizer.image_token_id + self.audio_token = tokenizer.audio_token + self.audio_token_id = tokenizer.audio_token_id super().__init__(image_processor, audio_processor, tokenizer, **kwargs) def __call__( @@ -113,7 +117,6 @@ class Phi4MultimodalProcessor(ProcessorMixin): output_kwargs = self._merge_kwargs(Phi4MultimodalProcessorKwargs, self.tokenizer.init_kwargs, **kwargs) image_kwargs = output_kwargs["images_kwargs"] audio_kwargs = output_kwargs["audio_kwargs"] - text_kwargs = output_kwargs["text_kwargs"] image_inputs = self.image_processor(images, **image_kwargs) if images is not None else {} audio_inputs = self.audio_processor(audio, **audio_kwargs) if audio is not None else {} @@ -154,7 +157,9 @@ class Phi4MultimodalProcessor(ProcessorMixin): re.sub(re.escape(audio_token), lambda _: audio_token * next(audio_count_iter), t) for t in processed_text ] - text_inputs = self.tokenizer(processed_text, **text_kwargs) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + text_inputs = self.tokenizer(processed_text, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(processed_text, text_inputs, modalities=["image"]) # prepare batch feature data = { @@ -163,7 +168,7 @@ class Phi4MultimodalProcessor(ProcessorMixin): **audio_inputs, } - return BatchFeature(data=data) + return BatchFeature(data=data, tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/pixtral/processing_pixtral.py b/src/transformers/models/pixtral/processing_pixtral.py index 1a542add69..4531c56b5a 100644 --- a/src/transformers/models/pixtral/processing_pixtral.py +++ b/src/transformers/models/pixtral/processing_pixtral.py @@ -103,6 +103,7 @@ class PixtralProcessor(ProcessorMixin): self.patch_size = patch_size self.spatial_merge_size = spatial_merge_size self.image_token = image_token + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) self.image_break_token = image_break_token self.image_end_token = image_end_token super().__init__(image_processor, tokenizer, chat_template=chat_template) @@ -211,10 +212,10 @@ class PixtralProcessor(ProcessorMixin): sample = sample.replace("", replace_str, 1) prompt_strings.append(sample) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"]) - return BatchFeature( - data={**text_inputs, **image_inputs}, tensor_type=output_kwargs["common_kwargs"]["return_tensors"] - ) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"]) + return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors) # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py index fa245c45f5..b47db20862 100644 --- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py @@ -941,16 +941,14 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor): if not isinstance(text, list): text = [text] + text = text.copy() # below lines change text in-place if image_grid_thw is not None: merge_length = self.image_processor.merge_size**2 index = 0 for i in range(len(text)): while self.image_token in text[i]: - text[i] = text[i].replace( - self.image_token, - "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length), - 1, - ) + num_image_tokens = image_grid_thw[index].prod() // merge_length + text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1) index += 1 text[i] = text[i].replace("<|placeholder|>", self.image_token) @@ -959,17 +957,16 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor): index = 0 for i in range(len(text)): while self.video_token in text[i]: - text[i] = text[i].replace( - self.video_token, - "<|placeholder|>" * (video_grid_thw[index].prod() // merge_length), - 1, - ) + num_video_tokens = video_grid_thw[index].prod() // merge_length + text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1) index += 1 text[i] = text[i].replace("<|placeholder|>", self.video_token) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) - return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}) + return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors) __all__ = [ diff --git a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py index e07642a1bf..d099432383 100644 --- a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py +++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py @@ -77,6 +77,16 @@ class Qwen2_5_VLProcessor(ProcessorMixin): def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token + self.image_token_id = ( + tokenizer.image_token_id + if getattr(tokenizer, "image_token_id", None) + else tokenizer.convert_tokens_to_ids(self.image_token) + ) + self.video_token_id = ( + tokenizer.video_token_id + if getattr(tokenizer, "video_token_id", None) + else tokenizer.convert_tokens_to_ids(self.video_token) + ) super().__init__(image_processor, tokenizer, chat_template=chat_template) def __call__( @@ -157,16 +167,14 @@ class Qwen2_5_VLProcessor(ProcessorMixin): if not isinstance(text, list): text = [text] + text = text.copy() # below lines change text in-place if image_grid_thw is not None: merge_length = self.image_processor.merge_size**2 index = 0 for i in range(len(text)): while self.image_token in text[i]: - text[i] = text[i].replace( - self.image_token, - "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length), - 1, - ) + num_image_tokens = image_grid_thw[index].prod() // merge_length + text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1) index += 1 text[i] = text[i].replace("<|placeholder|>", self.image_token) @@ -175,17 +183,16 @@ class Qwen2_5_VLProcessor(ProcessorMixin): index = 0 for i in range(len(text)): while self.video_token in text[i]: - text[i] = text[i].replace( - self.video_token, - "<|placeholder|>" * (video_grid_thw[index].prod() // merge_length), - 1, - ) + num_video_tokens = video_grid_thw[index].prod() // merge_length + text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1) index += 1 text[i] = text[i].replace("<|placeholder|>", self.video_token) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) - return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}) + return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/qwen2_audio/processing_qwen2_audio.py b/src/transformers/models/qwen2_audio/processing_qwen2_audio.py index 0daa90c564..0be2428f2e 100644 --- a/src/transformers/models/qwen2_audio/processing_qwen2_audio.py +++ b/src/transformers/models/qwen2_audio/processing_qwen2_audio.py @@ -76,6 +76,7 @@ class Qwen2AudioProcessor(ProcessorMixin): if chat_template is None: chat_template = self.default_chat_template self.audio_token = tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token + self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token) self.audio_bos_token = tokenizer.audio_bos_token if hasattr(tokenizer, "audio_bos_token") else audio_bos_token self.audio_eos_token = tokenizer.audio_eos_token if hasattr(tokenizer, "audio_eos_token") else audio_eos_token super().__init__(feature_extractor, tokenizer, chat_template=chat_template) @@ -179,12 +180,14 @@ class Qwen2AudioProcessor(ProcessorMixin): expanded_text.append(sample) text = expanded_text + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(text, inputs, modalities=["audio"]) if audio is not None: inputs.update(audio_inputs) - return BatchFeature(data={**inputs}) + return BatchFeature(data={**inputs}, tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/qwen2_vl/processing_qwen2_vl.py b/src/transformers/models/qwen2_vl/processing_qwen2_vl.py index 06b0adb0fb..241d63b029 100644 --- a/src/transformers/models/qwen2_vl/processing_qwen2_vl.py +++ b/src/transformers/models/qwen2_vl/processing_qwen2_vl.py @@ -72,6 +72,16 @@ class Qwen2VLProcessor(ProcessorMixin): def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs): self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token + self.image_token_id = ( + tokenizer.image_token_id + if getattr(tokenizer, "image_token_id", None) + else tokenizer.convert_tokens_to_ids(self.image_token) + ) + self.video_token_id = ( + tokenizer.video_token_id + if getattr(tokenizer, "video_token_id", None) + else tokenizer.convert_tokens_to_ids(self.video_token) + ) super().__init__(image_processor, tokenizer, chat_template=chat_template) def __call__( @@ -139,14 +149,15 @@ class Qwen2VLProcessor(ProcessorMixin): if not isinstance(text, list): text = [text] + text = text.copy() # below lines change text in-place + if image_grid_thw is not None: merge_length = self.image_processor.merge_size**2 index = 0 for i in range(len(text)): while self.image_token in text[i]: - text[i] = text[i].replace( - self.image_token, "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length), 1 - ) + num_image_tokens = image_grid_thw[index].prod() // merge_length + text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1) index += 1 text[i] = text[i].replace("<|placeholder|>", self.image_token) @@ -155,15 +166,15 @@ class Qwen2VLProcessor(ProcessorMixin): index = 0 for i in range(len(text)): while self.video_token in text[i]: - text[i] = text[i].replace( - self.video_token, "<|placeholder|>" * (video_grid_thw[index].prod() // merge_length), 1 - ) + num_video_tokens = video_grid_thw[index].prod() // merge_length + text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1) index += 1 text[i] = text[i].replace("<|placeholder|>", self.video_token) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) - - return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}) + self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"]) + return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors) def batch_decode(self, *args, **kwargs): """ diff --git a/src/transformers/models/smolvlm/processing_smolvlm.py b/src/transformers/models/smolvlm/processing_smolvlm.py index 615e3104d6..3a2357a4cc 100644 --- a/src/transformers/models/smolvlm/processing_smolvlm.py +++ b/src/transformers/models/smolvlm/processing_smolvlm.py @@ -149,6 +149,7 @@ class SmolVLMProcessor(ProcessorMixin): ): self.fake_image_token = getattr(tokenizer, "fake_image_token", "") self.image_token = getattr(tokenizer, "image_token", "") + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) self.end_of_utterance_token = getattr(tokenizer, "end_of_utterance_token", "") self.global_image_token = getattr(tokenizer, "global_image_token", "") self.image_seq_len = image_seq_len @@ -290,7 +291,7 @@ class SmolVLMProcessor(ProcessorMixin): if n_images_in_text > 0 and (images is None and videos is None): raise ValueError(f"We detected {n_images_in_text} tokens in the text but no images/videos were passed") - inputs = BatchFeature() + inputs = {} # Images and videos are mutually exclusive, so process one which is present if images is not None: images = make_nested_list_of_images(images) @@ -313,11 +314,14 @@ class SmolVLMProcessor(ProcessorMixin): ) inputs.update(vision_inputs) + return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None) + if text is not None: - text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"]) + text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"]) + self._check_special_mm_tokens(text, text_inputs, modalities=["image"]) inputs.update(text_inputs) - return inputs + return BatchFeature(inputs, tensor_type=return_tensors) def _process_messages_for_chat_template( self, diff --git a/src/transformers/models/video_llava/processing_video_llava.py b/src/transformers/models/video_llava/processing_video_llava.py index 4806720b37..a49bb1b225 100644 --- a/src/transformers/models/video_llava/processing_video_llava.py +++ b/src/transformers/models/video_llava/processing_video_llava.py @@ -87,6 +87,8 @@ class VideoLlavaProcessor(ProcessorMixin): self.vision_feature_select_strategy = vision_feature_select_strategy self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token + self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token) + self.video_token_id = tokenizer.convert_tokens_to_ids(self.video_token) super().__init__(image_processor, tokenizer, chat_template=chat_template) def __call__( @@ -195,14 +197,16 @@ class VideoLlavaProcessor(ProcessorMixin): text_inputs = self.tokenizer( prompt_strings, - return_tensors=return_tensors, + return_tensors=None, padding=padding, truncation=truncation, max_length=max_length, ) + self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image", "video"]) + data.update(text_inputs) - return BatchFeature(data=data) + return BatchFeature(data=data, tensor_type=return_tensors) # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama def batch_decode(self, *args, **kwargs): diff --git a/src/transformers/processing_utils.py b/src/transformers/processing_utils.py index b650972fb4..dc6bd4bf14 100644 --- a/src/transformers/processing_utils.py +++ b/src/transformers/processing_utils.py @@ -31,6 +31,7 @@ from huggingface_hub.errors import EntryNotFoundError from .audio_utils import load_audio from .dynamic_module_utils import custom_object_save +from .feature_extraction_utils import BatchFeature from .image_utils import ( ChannelDimension, ImageInput, @@ -1615,6 +1616,23 @@ class ProcessorMixin(PushToHubMixin): """ return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs) + def _check_special_mm_tokens(self, text: list[str], text_inputs: "BatchFeature", modalities: list[str]): + """ + Checks that number of special tokens in text and processed text is same. The count can be different + if tokenized text was truncated, leading to issues in model code. + """ + for modality in modalities: + token_str = getattr(self, f"{modality}_token") + token_id = getattr(self, f"{modality}_token_id") + ids_count = [list(ids).count(token_id) for ids in text_inputs["input_ids"]] + text_count = [sample.count(token_str) for sample in text] + + if ids_count != text_count: + raise ValueError( + f"Mismatch in `{modality}` token count between text and `input_ids`. Got ids={ids_count} and text={text_count}. " + "Likely due to `truncation='max_length'`. Please disable truncation or increase `max_length`." + ) + def _validate_images_text_input_order(images, text): """ diff --git a/tests/models/aria/test_processor_aria.py b/tests/models/aria/test_processor_aria.py index 08a6c5ba78..9df833661a 100644 --- a/tests/models/aria/test_processor_aria.py +++ b/tests/models/aria/test_processor_aria.py @@ -271,3 +271,29 @@ And who is that?<|im_end|> return_tensors="np", ) self.assertListEqual(list(out_dict[self.images_input_name].shape), [1, 3, 980, 980]) + + def test_special_mm_token_truncation(self): + """Tests that special vision tokens do not get truncated when `truncation=True` is set.""" + + processor = self.get_processor() + + input_str = self.prepare_text_inputs(batch_size=2, modality="image") + image_input = self.prepare_image_inputs(batch_size=2) + + _ = processor( + text=input_str, + images=image_input, + return_tensors="pt", + truncation=None, + padding=True, + ) + + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + images=image_input, + return_tensors="pt", + truncation=True, + padding=True, + max_length=3, + ) diff --git a/tests/models/chameleon/test_processor_chameleon.py b/tests/models/chameleon/test_processor_chameleon.py index 890b1f7f69..d11321c9a8 100644 --- a/tests/models/chameleon/test_processor_chameleon.py +++ b/tests/models/chameleon/test_processor_chameleon.py @@ -40,10 +40,37 @@ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase): tokenizer = LlamaTokenizer(vocab_file=SAMPLE_VOCAB) tokenizer.pad_token_id = 0 tokenizer.sep_token_id = 1 + tokenizer.add_special_tokens({"additional_special_tokens": [""]}) processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer, image_seq_length=2) processor.save_pretrained(cls.tmpdirname) cls.image_token = processor.image_token + def test_special_mm_token_truncation(self): + """Tests that special vision tokens do not get truncated when `truncation=True` is set.""" + + processor = self.get_processor() + + input_str = self.prepare_text_inputs(batch_size=2, modality="image") + image_input = self.prepare_image_inputs(batch_size=2) + + _ = processor( + text=input_str, + images=image_input, + return_tensors="pt", + truncation=None, + padding=True, + ) + + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + images=image_input, + return_tensors="pt", + truncation=True, + padding=True, + max_length=20, + ) + @staticmethod def prepare_processor_dict(): return {"image_seq_length": 2} # fmt: skip diff --git a/tests/models/gemma3/test_processing_gemma3.py b/tests/models/gemma3/test_processing_gemma3.py index a2290c9928..968a852d64 100644 --- a/tests/models/gemma3/test_processing_gemma3.py +++ b/tests/models/gemma3/test_processing_gemma3.py @@ -124,3 +124,28 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): # base image + 4 crops self.assertEqual(len(inputs[self.images_input_name]), 5) self.assertEqual(len(inputs[self.text_input_name][0]), 67) + + def test_special_mm_token_truncation(self): + """Tests that special vision tokens do not get truncated when `truncation=True` is set.""" + + processor = self.get_processor() + + input_str = self.prepare_text_inputs(batch_size=2, modality="image") + image_input = self.prepare_image_inputs(batch_size=2) + _ = processor( + text=input_str, + images=image_input, + return_tensors="pt", + truncation=None, + padding=True, + ) + + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + images=image_input, + return_tensors="pt", + truncation=True, + padding=True, + max_length=5, + ) diff --git a/tests/models/idefics2/test_processor_idefics2.py b/tests/models/idefics2/test_processor_idefics2.py index f2f06af707..a39d14d4f1 100644 --- a/tests/models/idefics2/test_processor_idefics2.py +++ b/tests/models/idefics2/test_processor_idefics2.py @@ -66,8 +66,8 @@ class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase): ) ) cls.bos_token = processor.tokenizer.bos_token - cls.image_token = processor.image_token.content - cls.fake_image_token = processor.fake_image_token.content + cls.image_token = processor.image_token + cls.fake_image_token = processor.fake_image_token cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token) cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token) diff --git a/tests/models/idefics3/test_processor_idefics3.py b/tests/models/idefics3/test_processor_idefics3.py index ad8a24a5a1..99b931a12c 100644 --- a/tests/models/idefics3/test_processor_idefics3.py +++ b/tests/models/idefics3/test_processor_idefics3.py @@ -60,8 +60,8 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase): ) ) cls.bos_token = processor.tokenizer.bos_token - cls.image_token = processor.image_token.content - cls.fake_image_token = processor.fake_image_token.content + cls.image_token = processor.image_token + cls.fake_image_token = processor.fake_image_token cls.global_img_token = processor.global_image_tag cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token) diff --git a/tests/models/llava/test_processor_llava.py b/tests/models/llava/test_processor_llava.py index 3a469d76f2..51ed955b84 100644 --- a/tests/models/llava/test_processor_llava.py +++ b/tests/models/llava/test_processor_llava.py @@ -40,6 +40,7 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): image_processor = CLIPImageProcessor(do_center_crop=False) tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b") + tokenizer.add_special_tokens({"additional_special_tokens": [""]}) processor_kwargs = cls.prepare_processor_dict() processor = LlavaProcessor(image_processor, tokenizer, **processor_kwargs) processor.save_pretrained(cls.tmpdirname) @@ -79,3 +80,29 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase): processor = LlavaProcessor.from_pretrained(checkpoint) tokenizer = AutoTokenizer.from_pretrained(checkpoint) self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__) + + def test_special_mm_token_truncation(self): + """Tests that special vision tokens do not get truncated when `truncation=True` is set.""" + + processor = LlavaProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf") + + input_str = self.prepare_text_inputs(batch_size=2, modality="image") + image_input = self.prepare_image_inputs(batch_size=2) + + _ = processor( + text=input_str, + images=image_input, + return_tensors="pt", + truncation=None, + padding=True, + ) + + with self.assertRaises(ValueError): + _ = processor( + text=input_str, + images=image_input, + return_tensors="pt", + truncation=True, + padding=True, + max_length=5, + ) diff --git a/tests/models/llava_next/test_processor_llava_next.py b/tests/models/llava_next/test_processor_llava_next.py index 47fbb241ac..a565212b49 100644 --- a/tests/models/llava_next/test_processor_llava_next.py +++ b/tests/models/llava_next/test_processor_llava_next.py @@ -40,6 +40,7 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase): image_processor = LlavaNextImageProcessor() tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b") + tokenizer.add_special_tokens({"additional_special_tokens": [""]}) processor_kwargs = cls.prepare_processor_dict() processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs) processor.save_pretrained(cls.tmpdirname) diff --git a/tests/models/llava_next_video/test_processor_llava_next_video.py b/tests/models/llava_next_video/test_processor_llava_next_video.py index 207d1a6372..d985e79aea 100644 --- a/tests/models/llava_next_video/test_processor_llava_next_video.py +++ b/tests/models/llava_next_video/test_processor_llava_next_video.py @@ -41,6 +41,7 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase): image_processor = LlavaNextImageProcessor() video_processor = LlavaNextVideoImageProcessor() tokenizer = LlamaTokenizerFast.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf") + tokenizer.add_special_tokens({"additional_special_tokens": ["", "