[vlm] adjust max length for special tokens (#37342)

* update
* apply suggestion
* fix tests for main branch
* remove unused logger
* add special tokens in tests
* nit
* fix more tests
* fix test
* pg also
2025-04-16 20:49:20 +02:00
parent c94c59fc47
commit 32eca7197a
39 changed files with 414 additions and 98 deletions
--- a/src/transformers/models/aria/modular_aria.py
+++ b/src/transformers/models/aria/modular_aria.py
@@ -946,6 +946,8 @@ class AriaProcessor(ProcessorMixin):
            size_conversion = {490: 128, 980: 256}
        self.size_conversion = {int(k): v for k, v in size_conversion.items()}
        self.image_token = tokenizer.image_token
        self.image_token_id = tokenizer.image_token_id
        if tokenizer is not None and tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.unk_token
@@ -986,10 +988,12 @@ class AriaProcessor(ProcessorMixin):
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        if isinstance(text, str):
            text = [text]
        elif not isinstance(text, list) and not isinstance(text[0], str):
            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
        if images is not None:
            image_inputs = self.image_processor(
                images,
@@ -1007,12 +1011,11 @@ class AriaProcessor(ProcessorMixin):
            image_inputs = {}
            prompt_strings = text
-        text_inputs = self.tokenizer(
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
-            prompt_strings,
+        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
-            **output_kwargs["text_kwargs"],
+        self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
        )
-        return BatchFeature(data={**text_inputs, **image_inputs})
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
    def batch_decode(self, *args, **kwargs):
        """
--- a/src/transformers/models/aria/processing_aria.py
+++ b/src/transformers/models/aria/processing_aria.py
@@ -72,6 +72,8 @@ class AriaProcessor(ProcessorMixin):
            size_conversion = {490: 128, 980: 256}
        self.size_conversion = {int(k): v for k, v in size_conversion.items()}
        self.image_token = tokenizer.image_token
        self.image_token_id = tokenizer.image_token_id
        if tokenizer is not None and tokenizer.pad_token is None:
            tokenizer.pad_token = tokenizer.unk_token
@@ -112,10 +114,12 @@ class AriaProcessor(ProcessorMixin):
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        if isinstance(text, str):
            text = [text]
        elif not isinstance(text, list) and not isinstance(text[0], str):
            raise ValueError("Invalid input text. Please provide a string, or a list of strings")
        if images is not None:
            image_inputs = self.image_processor(
                images,
@@ -133,12 +137,11 @@ class AriaProcessor(ProcessorMixin):
            image_inputs = {}
            prompt_strings = text
-        text_inputs = self.tokenizer(
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
-            prompt_strings,
+        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
-            **output_kwargs["text_kwargs"],
+        self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
        )
-        return BatchFeature(data={**text_inputs, **image_inputs})
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
    def batch_decode(self, *args, **kwargs):
        """
--- a/src/transformers/models/aya_vision/processing_aya_vision.py
+++ b/src/transformers/models/aya_vision/processing_aya_vision.py
@@ -121,6 +121,7 @@ class AyaVisionProcessor(ProcessorMixin):
        super().__init__(image_processor, tokenizer, chat_template=chat_template)
        self.image_token = image_token
        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
        self.patch_size = patch_size * downsample_factor
        self.img_size = img_size
@@ -224,9 +225,11 @@ class AyaVisionProcessor(ProcessorMixin):
            text = processed_text
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
-        return BatchFeature(data={**text_inputs, **image_inputs})
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
    def batch_decode(self, *args, **kwargs):
        """
--- a/src/transformers/models/chameleon/processing_chameleon.py
+++ b/src/transformers/models/chameleon/processing_chameleon.py
@@ -68,6 +68,7 @@ class ChameleonProcessor(ProcessorMixin):
    def __init__(self, image_processor, tokenizer, image_seq_length: int = 1024, image_token: str = "<image>"):
        self.image_seq_length = image_seq_length
        self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
        self.image_start_token = (
            tokenizer.boi_token if hasattr(tokenizer, "boi_token") else "<racm3:break>"
        )  # fixed tokens for start and end, so can hardcode
@@ -140,12 +141,14 @@ class ChameleonProcessor(ProcessorMixin):
                sample += self.tokenizer.sep_token  # special Chameleon treatment to add sep for chat mode
            prompt_strings.append(sample)
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(prompt_strings, data, modalities=["image"])
        if images is not None:
            data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
-        return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"])
+        return BatchFeature(data=data, tensor_type=return_tensors)
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
    def batch_decode(self, *args, **kwargs):
--- a/src/transformers/models/emu3/processing_emu3.py
+++ b/src/transformers/models/emu3/processing_emu3.py
@@ -75,6 +75,7 @@ class Emu3Processor(ProcessorMixin):
        **kwargs,
    ):
        self.image_token = tokenizer.image_token  # image_token as placeholder to be replaced by vq-vae tokens
        self.image_token_id = tokenizer.image_token_id
        self.image_start_token = tokenizer.boi_token  # "<|image start|>" fixed tokens for start and end of image
        self.image_end_token = tokenizer.eoi_token  # "<|image end|>"
        self.fake_token_around_image = tokenizer.image_wrapper_token  # "<|image token|>"  every image starts with it
@@ -177,10 +178,13 @@ class Emu3Processor(ProcessorMixin):
            image_features["image_sizes"] = [[height, width]] * len(text)
        # else just generate from text-only input, and we do no special treatment for text
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        data = self.tokenizer(text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(text, data, modalities=["image"])
        data.update(**image_features)
-        return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].pop("return_tensors", None))
+        return BatchFeature(data=data, tensor_type=return_tensors)
    def calculate_generate_size(self, ratio, image_area, spatial_factor):
        width, height = map(int, ratio.split(":"))
--- a/src/transformers/models/gemma3/processing_gemma3.py
+++ b/src/transformers/models/gemma3/processing_gemma3.py
@@ -65,7 +65,7 @@ class Gemma3Processor(ProcessorMixin):
        self.image_seq_length = image_seq_length
        self.image_token_id = tokenizer.image_token_id
        self.boi_token = tokenizer.boi_token
-        self.image_token = tokenizer.boi_token
+        self.image_token = tokenizer.image_token
        image_tokens_expanded = "".join([tokenizer.image_token] * image_seq_length)
        self.full_image_sequence = f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n"
@@ -138,6 +138,7 @@ class Gemma3Processor(ProcessorMixin):
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"], return_tensors="np")
        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
        # Add token type ids manually, as tokenizer can't do arbitrary position token types
        array_ids = text_inputs["input_ids"]
--- a/src/transformers/models/got_ocr2/processing_got_ocr2.py
+++ b/src/transformers/models/got_ocr2/processing_got_ocr2.py
@@ -107,6 +107,8 @@ class GotOcr2Processor(ProcessorMixin):
        self.img_start_token = "<img>"
        self.img_end_token = "</img>"
        self.img_pad_token = "<imgpad>"
        self.image_token = "<imgpad>"  # keep the above for BC, but we need to call it `image_token`
        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
        self.system_query = "system\nYou should follow the instructions carefully and explain your answers in detail."
    def _make_list_of_inputs(self, images, text, box, color, multi_page):
@@ -250,8 +252,11 @@ class GotOcr2Processor(ProcessorMixin):
                )
                text.append(prompt)
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
-        return BatchFeature(data={**text_inputs, **image_inputs})
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
    def batch_decode(self, *args, **kwargs):
        """
--- a/src/transformers/models/idefics2/processing_idefics2.py
+++ b/src/transformers/models/idefics2/processing_idefics2.py
@@ -98,13 +98,15 @@ class Idefics2Processor(ProcessorMixin):
            raise ValueError("You need to specify a `tokenizer`.")
        if not hasattr(tokenizer, "image_token"):
-            self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True)
+            self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True).content
-            self.image_token = AddedToken("<image>", normalized=False, special=True)
+            self.image_token = AddedToken("<image>", normalized=False, special=True).content
            tokens_to_add = {"additional_special_tokens": [self.fake_image_token, self.image_token]}
            tokenizer.add_special_tokens(tokens_to_add)
            self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
        else:
            self.fake_image_token = tokenizer.image_boundary_token
            self.image_token = tokenizer.image_token
            self.image_token_id = tokenizer.image_token_id
        self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True)
        tokenizer.add_special_tokens({"additional_special_tokens": [self.end_of_utterance_token]})
@@ -190,9 +192,10 @@ class Idefics2Processor(ProcessorMixin):
        )
        image_seq_len = output_kwargs["images_kwargs"].pop("image_seq_len", None)
        image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        n_images_in_text = []
-        inputs = BatchFeature()
+        inputs = {}
        if text is not None:
            if isinstance(text, str):
@@ -201,13 +204,14 @@ class Idefics2Processor(ProcessorMixin):
                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
            # Replace the image token with fake tokens around the expanded image token sequence of length `image_seq_len`
-            fake_image_token = self.fake_image_token.content
+            fake_image_token = self.fake_image_token
-            image_token = self.image_token.content
+            image_token = self.image_token
            image_str = f"{fake_image_token}{image_token * image_seq_len}{fake_image_token}"
            if self.image_processor.do_image_splitting:
                # A single image token is split into 4 patches + 1 original image
                image_str = image_str * 5
                image_seq_len *= 5
            prompt_strings = []
            for sample in text:
@@ -218,6 +222,7 @@ class Idefics2Processor(ProcessorMixin):
                prompt_strings.append(sample)
            text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
            self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
            inputs.update(text_inputs)
        if images is not None:
@@ -259,7 +264,7 @@ class Idefics2Processor(ProcessorMixin):
            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
            inputs.update(image_inputs)
-        return inputs
+        return BatchFeature(inputs, tensor_type=return_tensors)
    def batch_decode(self, *args, **kwargs):
        """
--- a/src/transformers/models/idefics3/processing_idefics3.py
+++ b/src/transformers/models/idefics3/processing_idefics3.py
@@ -141,9 +141,9 @@ class Idefics3Processor(ProcessorMixin):
        if tokenizer is None:
            raise ValueError("You need to specify a `tokenizer`.")
-        self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True)
+        self.fake_image_token = AddedToken("<fake_token_around_image>", normalized=False, special=True).content
-        self.image_token = AddedToken("<image>", normalized=False, special=True)
+        self.image_token = AddedToken("<image>", normalized=False, special=True).content
-        self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True)
+        self.end_of_utterance_token = AddedToken("<end_of_utterance>", normalized=False, special=True).content
        self.global_image_tag = "<global-img>"  # https://github.com/huggingface/transformers/pull/32473/files/8063e5e17362571b693f1db95167f5443a3be1b2#r1734825341
        self.image_seq_len = image_seq_len
@@ -159,6 +159,7 @@ class Idefics3Processor(ProcessorMixin):
            ]
        }
        tokenizer.add_special_tokens(tokens_to_add)
        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
        super().__init__(image_processor, tokenizer, chat_template=chat_template, **kwargs)
@@ -240,17 +241,18 @@ class Idefics3Processor(ProcessorMixin):
        )
        image_seq_len = image_seq_len if image_seq_len is not None else self.image_seq_len
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        n_images_in_text = []
        n_images_in_images = []
-        inputs = BatchFeature()
+        inputs = {}
        if text is not None:
            if isinstance(text, str):
                text = [text]
            elif not isinstance(text, list) and not isinstance(text[0], str):
                raise ValueError("Invalid input text. Please provide a string, or a list of strings")
-            n_images_in_text = [sample.count(self.image_token.content) for sample in text]
+            n_images_in_text = [sample.count(self.image_token) for sample in text]
        if images is not None:
            if is_image_or_image_url(images):
@@ -259,8 +261,8 @@ class Idefics3Processor(ProcessorMixin):
                if text is not None:
                    if sum(n_images_in_text) != len(images):
                        raise ValueError(
-                            f"The total number of {self.image_token.content} tokens in the prompts should be the same as the number of images passed."
+                            f"The total number of {self.image_token} tokens in the prompts should be the same as the number of images passed."
-                            f" Found {sum(n_images_in_text)} {self.image_token.content} tokens and {len(images)} images."
+                            f" Found {sum(n_images_in_text)} {self.image_token} tokens and {len(images)} images."
                        )
                    # Reorganize the images to match the prompts
                    cumsum_images_in_text = [0] + list(accumulate(n_images_in_text))
@@ -295,8 +297,8 @@ class Idefics3Processor(ProcessorMixin):
                image_rows = inputs.pop("rows", [[0] * len(text)])
                image_cols = inputs.pop("cols", [[0] * len(text)])
-                fake_image_token = self.fake_image_token.content
+                fake_image_token = self.fake_image_token
-                image_token = self.image_token.content
+                image_token = self.image_token
                global_img_token = self.global_image_tag
                prompt_strings = []
@@ -324,18 +326,19 @@ class Idefics3Processor(ProcessorMixin):
                        sample += image_prompt_string + split_sample[i + 1]
                    prompt_strings.append(sample)
-                text_inputs = self.tokenizer(text=prompt_strings, **output_kwargs["text_kwargs"])
+                text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
                self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
                inputs.update(text_inputs)
        elif text is not None:
            if any(n_images_in_text):
                raise ValueError(
-                    f"Found {sum(n_images_in_text)} {self.image_token.content} tokens in the text but no images were passed."
+                    f"Found {sum(n_images_in_text)} {self.image_token} tokens in the text but no images were passed."
                )
            text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
            inputs.update(text_inputs)
-        return inputs
+        return BatchFeature(inputs, tensor_type=return_tensors)
    def batch_decode(self, *args, **kwargs):
        """
--- a/src/transformers/models/llama4/processing_llama4.py
+++ b/src/transformers/models/llama4/processing_llama4.py
@@ -122,6 +122,7 @@ class Llama4Processor(ProcessorMixin):
        self.fake_image_token = fake_image_token
        self.image_token = image_token
        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
        self.start_of_img_token = start_of_image_token
        self.end_of_img_token = end_of_image_token
        self.img_patch_token = patch_token
@@ -148,6 +149,7 @@ class Llama4Processor(ProcessorMixin):
                        img_string += "<|tile_x_separator|>"
                img_string += "<|tile_y_separator|>"
        img_string += "<|image|>"
        img_string += "<|patch|>" * num_patches_per_chunk
        img_string += "<|image_end|>"
@@ -247,9 +249,11 @@ class Llama4Processor(ProcessorMixin):
            text = processed_text
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
-        return BatchFeature(data={**text_inputs, **image_inputs})
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
    def batch_decode(self, *args, **kwargs):
        """
--- a/src/transformers/models/llava/processing_llava.py
+++ b/src/transformers/models/llava/processing_llava.py
@@ -173,8 +173,10 @@ class LlavaProcessor(ProcessorMixin):
                sample = sample.replace(self.image_token, self.image_token * num_image_tokens)
                prompt_strings.append(sample)
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
-        return BatchFeature(data={**text_inputs, **image_inputs})
+        self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
    def batch_decode(self, *args, **kwargs):
--- a/src/transformers/models/llava_next/processing_llava_next.py
+++ b/src/transformers/models/llava_next/processing_llava_next.py
@@ -171,9 +171,11 @@ class LlavaNextProcessor(ProcessorMixin):
                prompt_strings.append(sample)
            prompt_strings = [sample.replace("<placeholder>", self.image_token) for sample in prompt_strings]
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
-        return BatchFeature(data={**text_inputs, **image_inputs})
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
    def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
        image_grid_pinpoints = self.image_processor.image_grid_pinpoints
--- a/src/transformers/models/llava_next_video/processing_llava_next_video.py
+++ b/src/transformers/models/llava_next_video/processing_llava_next_video.py
@@ -224,8 +224,11 @@ class LlavaNextVideoProcessor(ProcessorMixin):
                prompt_strings.append(sample)
            text = prompt_strings
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
-        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
    # Copied from transformers.models.llava_next.processing_llava_next.LlavaNextProcessor._get_number_of_features
    def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
--- a/src/transformers/models/llava_onevision/processing_llava_onevision.py
+++ b/src/transformers/models/llava_onevision/processing_llava_onevision.py
@@ -171,7 +171,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
                to_numpy_array(image_inputs["pixel_values"][0][0]),
                channel_dim=output_kwargs["images_kwargs"].get("data_format"),
            )
-            text = self._expand_image_tokens(text, image_sizes, height, width, self.image_token)
+            text, num_image_tokens = self._expand_image_tokens(text, image_sizes, height, width, self.image_token)
        if videos is not None:
            video_inputs = self.video_processor(videos, **output_kwargs["videos_kwargs"])
@@ -188,8 +188,11 @@ class LlavaOnevisionProcessor(ProcessorMixin):
            num_video_tokens = (num_frames * pooled_height_width * pooled_height_width) + 1  # +1 for newline token
            text = [sample.replace(self.video_token, self.video_token * num_video_tokens) for sample in text]
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
-        return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs})
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
        return BatchFeature(data={**text_inputs, **image_inputs, **video_inputs}, tensor_type=return_tensors)
    def _expand_image_tokens(
        self,
@@ -201,6 +204,7 @@ class LlavaOnevisionProcessor(ProcessorMixin):
        num_frames: int = 1,
    ):
        prompt_strings = []
        max_num_vision_tokens = 0
        for sample in text:
            while special_token in sample:
                image_size_list = next(image_sizes)
@@ -210,12 +214,13 @@ class LlavaOnevisionProcessor(ProcessorMixin):
                    original_size = original_size.tolist()
                orig_height, orig_width = original_size
                num_image_tokens = self._get_number_of_features(orig_height, orig_width, height, width)
                max_num_vision_tokens = max(max_num_vision_tokens, num_image_tokens)
                if self.vision_feature_select_strategy == "default":
                    num_image_tokens -= 1
                sample = sample.replace(special_token, "<placeholder>" * num_image_tokens * num_frames, 1)
            prompt_strings.append(sample)
        text = [sample.replace("<placeholder>", special_token) for sample in prompt_strings]
-        return text
+        return text, max_num_vision_tokens
    def _get_number_of_features(self, orig_height: int, orig_width: int, height: int, width: int) -> int:
        image_grid_pinpoints = self.image_processor.image_grid_pinpoints
--- a/src/transformers/models/mllama/processing_mllama.py
+++ b/src/transformers/models/mllama/processing_mllama.py
@@ -274,6 +274,7 @@ class MllamaProcessor(ProcessorMixin):
        )
        text_kwargs = output_kwargs["text_kwargs"]
        text_kwargs["return_tensors"] = None
        images_kwargs = output_kwargs["images_kwargs"]
        common_kwargs = output_kwargs["common_kwargs"]
@@ -287,6 +288,8 @@ class MllamaProcessor(ProcessorMixin):
            text = [build_string_from_input(text_item, self.bos_token, self.image_token) for text_item in text]
            _ = text_kwargs.pop("padding_side", None)  # hack until padding-side is an accepted kwarg by tokenizers
            encoding = self.tokenizer(text, **text_kwargs)
            self._check_special_mm_tokens(text, encoding, modalities=["image"])
            n_images_in_ids = [token_ids.count(self.image_token_id) for token_ids in encoding["input_ids"]]
            data.update(encoding)
        n_images_in_images = [0]
@@ -301,13 +304,18 @@ class MllamaProcessor(ProcessorMixin):
                raise ValueError(
                    "If a batch of text is provided, there should be either no images or at least one image per sample"
                )
-            if sum(n_images_in_text) > 0 and n_images_in_images != n_images_in_text:
+            if sum(n_images_in_text) > 0 and (
                n_images_in_images != n_images_in_text or n_images_in_ids != n_images_in_images
            ):
                if images is None:
                    raise ValueError("No image were provided, but there are image tokens in the prompt")
                else:
                    add_message = ""
-                    if sum(n_images_in_images) == sum(n_images_in_text):
+                    if sum(n_images_in_images) == sum(n_images_in_text) and n_images_in_images != n_images_in_text:
                        add_message = "Make sure to pass your images as a nested list, where each sub-list holds images per batch"
                    elif n_images_in_ids != n_images_in_images:
                        add_message = "If you activated truncation with `max_length`, increase the `max_length` so image tokens aren't cropped."
                    raise ValueError(
                        f"The number of image tokens in each text ({n_images_in_text}) should be the same as the "
                        f"number of provided images per batch ({n_images_in_images}). {add_message}"
--- a/src/transformers/models/paligemma/processing_paligemma.py
+++ b/src/transformers/models/paligemma/processing_paligemma.py
@@ -298,23 +298,21 @@ class PaliGemmaProcessor(ProcessorMixin):
            suffix = [sfx + self.tokenizer.eos_token for sfx in suffix]
        pixel_values = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
-        # max_length has to account for the image tokens
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        if output_kwargs["text_kwargs"].get("max_length", None) is not None:
            output_kwargs["text_kwargs"]["max_length"] += self.image_seq_length
        inputs = self.tokenizer(
            input_strings,
            text_pair=suffix,
            return_token_type_ids=return_token_type_ids,
            **output_kwargs["text_kwargs"],
        )
        self._check_special_mm_tokens(input_strings, inputs, modalities=["image"])
        return_data = {**inputs, "pixel_values": pixel_values}
        if return_token_type_ids:
            labels = inputs["input_ids"].masked_fill(inputs["token_type_ids"] == 0, -100)
            return_data.update({"labels": labels})
-        return BatchFeature(data=return_data)
+        return BatchFeature(data=return_data, tensor_type=return_tensors)
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Gemma
    def batch_decode(self, *args, **kwargs):
--- a/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py
+++ b/src/transformers/models/phi4_multimodal/processing_phi4_multimodal.py
@@ -71,6 +71,10 @@ class Phi4MultimodalProcessor(ProcessorMixin):
        tokenizer,
        **kwargs,
    ):
        self.image_token = tokenizer.image_token
        self.image_token_id = tokenizer.image_token_id
        self.audio_token = tokenizer.audio_token
        self.audio_token_id = tokenizer.audio_token_id
        super().__init__(image_processor, audio_processor, tokenizer, **kwargs)
    def __call__(
@@ -113,7 +117,6 @@ class Phi4MultimodalProcessor(ProcessorMixin):
        output_kwargs = self._merge_kwargs(Phi4MultimodalProcessorKwargs, self.tokenizer.init_kwargs, **kwargs)
        image_kwargs = output_kwargs["images_kwargs"]
        audio_kwargs = output_kwargs["audio_kwargs"]
        text_kwargs = output_kwargs["text_kwargs"]
        image_inputs = self.image_processor(images, **image_kwargs) if images is not None else {}
        audio_inputs = self.audio_processor(audio, **audio_kwargs) if audio is not None else {}
@@ -154,7 +157,9 @@ class Phi4MultimodalProcessor(ProcessorMixin):
            re.sub(re.escape(audio_token), lambda _: audio_token * next(audio_count_iter), t) for t in processed_text
        ]
-        text_inputs = self.tokenizer(processed_text, **text_kwargs)
+        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        text_inputs = self.tokenizer(processed_text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(processed_text, text_inputs, modalities=["image"])
        # prepare batch feature
        data = {
@@ -163,7 +168,7 @@ class Phi4MultimodalProcessor(ProcessorMixin):
            **audio_inputs,
        }
-        return BatchFeature(data=data)
+        return BatchFeature(data=data, tensor_type=return_tensors)
    def batch_decode(self, *args, **kwargs):
        """
--- a/src/transformers/models/pixtral/processing_pixtral.py
+++ b/src/transformers/models/pixtral/processing_pixtral.py
@@ -103,6 +103,7 @@ class PixtralProcessor(ProcessorMixin):
        self.patch_size = patch_size
        self.spatial_merge_size = spatial_merge_size
        self.image_token = image_token
        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
        self.image_break_token = image_break_token
        self.image_end_token = image_end_token
        super().__init__(image_processor, tokenizer, chat_template=chat_template)
@@ -211,10 +212,10 @@ class PixtralProcessor(ProcessorMixin):
                    sample = sample.replace("<placeholder>", replace_str, 1)
                prompt_strings.append(sample)
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        text_inputs = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
-        return BatchFeature(
+        self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image"])
-            data={**text_inputs, **image_inputs}, tensor_type=output_kwargs["common_kwargs"]["return_tensors"]
+        return BatchFeature(data={**text_inputs, **image_inputs}, tensor_type=return_tensors)
        )
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
    def batch_decode(self, *args, **kwargs):
--- a/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/modular_qwen2_5_vl.py
@@ -941,16 +941,14 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor):
        if not isinstance(text, list):
            text = [text]
        text = text.copy()  # below lines change text in-place
        if image_grid_thw is not None:
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
-                    text[i] = text[i].replace(
+                    num_image_tokens = image_grid_thw[index].prod() // merge_length
-                        self.image_token,
+                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
                        "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length),
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.image_token)
@@ -959,17 +957,16 @@ class Qwen2_5_VLProcessor(Qwen2VLProcessor):
            index = 0
            for i in range(len(text)):
                while self.video_token in text[i]:
-                    text[i] = text[i].replace(
+                    num_video_tokens = video_grid_thw[index].prod() // merge_length
-                        self.video_token,
+                    text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
                        "<|placeholder|>" * (video_grid_thw[index].prod() // merge_length),
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.video_token)
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
-        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
+        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
 __all__ = [
--- a/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py
+++ b/src/transformers/models/qwen2_5_vl/processing_qwen2_5_vl.py
@@ -77,6 +77,16 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
        """
        Record the image/video placeholder tokens and their ids, then run the parent initializer.

        Args:
            image_processor: Forwarded positionally to the parent initializer.
            tokenizer: Forwarded positionally to the parent initializer. Its `image_token`,
                `video_token`, `image_token_id` and `video_token_id` attributes are used when
                present; otherwise the defaults below are used and the ids are resolved with
                `tokenizer.convert_tokens_to_ids`.
            chat_template: Forwarded to the parent initializer as a keyword argument.
            **kwargs: Accepted but not read by this method.
        """
        # Prefer the placeholder strings declared by the tokenizer, else fall back to the defaults.
        self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
        self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
        # Compare against `None` explicitly: a bare truthiness test would treat a valid id of 0 as missing.
        image_token_id = getattr(tokenizer, "image_token_id", None)
        self.image_token_id = (
            image_token_id if image_token_id is not None else tokenizer.convert_tokens_to_ids(self.image_token)
        )
        video_token_id = getattr(tokenizer, "video_token_id", None)
        self.video_token_id = (
            video_token_id if video_token_id is not None else tokenizer.convert_tokens_to_ids(self.video_token)
        )
        super().__init__(image_processor, tokenizer, chat_template=chat_template)
    def __call__(
@@ -157,16 +167,14 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
        if not isinstance(text, list):
            text = [text]
        text = text.copy()  # below lines change text in-place
        if image_grid_thw is not None:
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
-                    text[i] = text[i].replace(
+                    num_image_tokens = image_grid_thw[index].prod() // merge_length
-                        self.image_token,
+                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
                        "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length),
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.image_token)
@@ -175,17 +183,16 @@ class Qwen2_5_VLProcessor(ProcessorMixin):
            index = 0
            for i in range(len(text)):
                while self.video_token in text[i]:
-                    text[i] = text[i].replace(
+                    num_video_tokens = video_grid_thw[index].prod() // merge_length
-                        self.video_token,
+                    text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
                        "<|placeholder|>" * (video_grid_thw[index].prod() // merge_length),
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.video_token)
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
-        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
+        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
    def batch_decode(self, *args, **kwargs):
        """
--- a/src/transformers/models/qwen2_audio/processing_qwen2_audio.py
+++ b/src/transformers/models/qwen2_audio/processing_qwen2_audio.py
@@ -76,6 +76,7 @@ class Qwen2AudioProcessor(ProcessorMixin):
        if chat_template is None:
            chat_template = self.default_chat_template
        self.audio_token = tokenizer.audio_token if hasattr(tokenizer, "audio_token") else audio_token
        self.audio_token_id = tokenizer.convert_tokens_to_ids(self.audio_token)
        self.audio_bos_token = tokenizer.audio_bos_token if hasattr(tokenizer, "audio_bos_token") else audio_bos_token
        self.audio_eos_token = tokenizer.audio_eos_token if hasattr(tokenizer, "audio_eos_token") else audio_eos_token
        super().__init__(feature_extractor, tokenizer, chat_template=chat_template)
@@ -179,12 +180,14 @@ class Qwen2AudioProcessor(ProcessorMixin):
                expanded_text.append(sample)
            text = expanded_text
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        self._check_special_mm_tokens(text, inputs, modalities=["audio"])
        if audio is not None:
            inputs.update(audio_inputs)
-        return BatchFeature(data={**inputs})
+        return BatchFeature(data={**inputs}, tensor_type=return_tensors)
    def batch_decode(self, *args, **kwargs):
        """
--- a/src/transformers/models/qwen2_vl/processing_qwen2_vl.py
+++ b/src/transformers/models/qwen2_vl/processing_qwen2_vl.py
@@ -72,6 +72,16 @@ class Qwen2VLProcessor(ProcessorMixin):
    def __init__(self, image_processor=None, tokenizer=None, chat_template=None, **kwargs):
        """
        Store the image/video placeholder tokens and their ids before delegating to the parent initializer.

        Args:
            image_processor: Passed positionally to the parent initializer.
            tokenizer: Passed positionally to the parent initializer. When it exposes `image_token`,
                `video_token`, `image_token_id` or `video_token_id`, those values are reused; missing
                token strings fall back to the defaults below and missing ids are looked up with
                `tokenizer.convert_tokens_to_ids`.
            chat_template: Passed to the parent initializer as a keyword argument.
            **kwargs: Accepted but not read by this method.
        """
        # Placeholder strings: tokenizer-provided values win over the hard-coded defaults.
        self.image_token = "<|image_pad|>" if not hasattr(tokenizer, "image_token") else tokenizer.image_token
        self.video_token = "<|video_pad|>" if not hasattr(tokenizer, "video_token") else tokenizer.video_token
        # Use an explicit `is not None` test so that a legitimate token id of 0 is not discarded.
        image_token_id = getattr(tokenizer, "image_token_id", None)
        self.image_token_id = (
            image_token_id if image_token_id is not None else tokenizer.convert_tokens_to_ids(self.image_token)
        )
        video_token_id = getattr(tokenizer, "video_token_id", None)
        self.video_token_id = (
            video_token_id if video_token_id is not None else tokenizer.convert_tokens_to_ids(self.video_token)
        )
        super().__init__(image_processor, tokenizer, chat_template=chat_template)
    def __call__(
@@ -139,14 +149,15 @@ class Qwen2VLProcessor(ProcessorMixin):
        if not isinstance(text, list):
            text = [text]
        text = text.copy()  # below lines change text in-place
        if image_grid_thw is not None:
            merge_length = self.image_processor.merge_size**2
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
-                    text[i] = text[i].replace(
+                    num_image_tokens = image_grid_thw[index].prod() // merge_length
-                        self.image_token, "<|placeholder|>" * (image_grid_thw[index].prod() // merge_length), 1
+                    text[i] = text[i].replace(self.image_token, "<|placeholder|>" * num_image_tokens, 1)
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.image_token)
@@ -155,15 +166,15 @@ class Qwen2VLProcessor(ProcessorMixin):
            index = 0
            for i in range(len(text)):
                while self.video_token in text[i]:
-                    text[i] = text[i].replace(
+                    num_video_tokens = video_grid_thw[index].prod() // merge_length
-                        self.video_token, "<|placeholder|>" * (video_grid_thw[index].prod() // merge_length), 1
+                    text[i] = text[i].replace(self.video_token, "<|placeholder|>" * num_video_tokens, 1)
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.video_token)
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
-
+        self._check_special_mm_tokens(text, text_inputs, modalities=["image", "video"])
-        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs})
+        return BatchFeature(data={**text_inputs, **image_inputs, **videos_inputs}, tensor_type=return_tensors)
    def batch_decode(self, *args, **kwargs):
        """
--- a/src/transformers/models/smolvlm/processing_smolvlm.py
+++ b/src/transformers/models/smolvlm/processing_smolvlm.py
@@ -149,6 +149,7 @@ class SmolVLMProcessor(ProcessorMixin):
    ):
        self.fake_image_token = getattr(tokenizer, "fake_image_token", "<fake_token_around_image>")
        self.image_token = getattr(tokenizer, "image_token", "<image>")
        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
        self.end_of_utterance_token = getattr(tokenizer, "end_of_utterance_token", "<end_of_utterance>")
        self.global_image_token = getattr(tokenizer, "global_image_token", "<global-img>")
        self.image_seq_len = image_seq_len
@@ -290,7 +291,7 @@ class SmolVLMProcessor(ProcessorMixin):
            if n_images_in_text > 0 and (images is None and videos is None):
                raise ValueError(f"We detected {n_images_in_text} tokens in the text but no images/videos were passed")
-        inputs = BatchFeature()
+        inputs = {}
        # Images and videos are mutually exclusive, so process one which is present
        if images is not None:
            images = make_nested_list_of_images(images)
@@ -313,11 +314,14 @@ class SmolVLMProcessor(ProcessorMixin):
            )
            inputs.update(vision_inputs)
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
        if text is not None:
-            text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"])
+            text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
            self._check_special_mm_tokens(text, text_inputs, modalities=["image"])
            inputs.update(text_inputs)
-        return inputs
+        return BatchFeature(inputs, tensor_type=return_tensors)
    def _process_messages_for_chat_template(
        self,
--- a/src/transformers/models/video_llava/processing_video_llava.py
+++ b/src/transformers/models/video_llava/processing_video_llava.py
@@ -87,6 +87,8 @@ class VideoLlavaProcessor(ProcessorMixin):
        self.vision_feature_select_strategy = vision_feature_select_strategy
        self.image_token = tokenizer.image_token if hasattr(tokenizer, "image_token") else image_token
        self.video_token = tokenizer.video_token if hasattr(tokenizer, "video_token") else video_token
        self.image_token_id = tokenizer.convert_tokens_to_ids(self.image_token)
        self.video_token_id = tokenizer.convert_tokens_to_ids(self.video_token)
        super().__init__(image_processor, tokenizer, chat_template=chat_template)
    def __call__(
@@ -195,14 +197,16 @@ class VideoLlavaProcessor(ProcessorMixin):
        text_inputs = self.tokenizer(
            prompt_strings,
-            return_tensors=return_tensors,
+            return_tensors=None,
            padding=padding,
            truncation=truncation,
            max_length=max_length,
        )
        self._check_special_mm_tokens(prompt_strings, text_inputs, modalities=["image", "video"])
        data.update(text_inputs)
-        return BatchFeature(data=data)
+        return BatchFeature(data=data, tensor_type=return_tensors)
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
    def batch_decode(self, *args, **kwargs):
--- a/src/transformers/processing_utils.py
+++ b/src/transformers/processing_utils.py
@@ -31,6 +31,7 @@ from huggingface_hub.errors import EntryNotFoundError
 from .audio_utils import load_audio
 from .dynamic_module_utils import custom_object_save
 from .feature_extraction_utils import BatchFeature
 from .image_utils import (
    ChannelDimension,
    ImageInput,
@@ -1615,6 +1616,23 @@ class ProcessorMixin(PushToHubMixin):
        """
        return self.tokenizer.batch_decode(generated_outputs, skip_special_tokens=skip_special_tokens, **kwargs)
    def _check_special_mm_tokens(self, text: list[str], text_inputs: "BatchFeature", modalities: list[str]):
        """
        Verify, per modality, that every sample holds as many special tokens in `input_ids` as in its text.

        Args:
            text (`list[str]`):
                The text samples; occurrences of `self.<modality>_token` are counted in each one.
            text_inputs (`BatchFeature`):
                Tokenized output; `text_inputs["input_ids"]` is iterated sample by sample and occurrences
                of `self.<modality>_token_id` are counted in each entry.
            modalities (`list[str]`):
                Modality names used to look up the `<modality>_token` and `<modality>_token_id` attributes.

        Raises:
            `ValueError`: If the per-sample counts differ for any of the given modalities.
        """
        for name in modalities:
            placeholder = getattr(self, f"{name}_token")
            placeholder_id = getattr(self, f"{name}_token_id")
            ids_count = []
            for sample_ids in text_inputs["input_ids"]:
                ids_count.append(list(sample_ids).count(placeholder_id))
            text_count = []
            for sample in text:
                text_count.append(sample.count(placeholder))
            # Matching counts mean no special token was lost; move on to the next modality.
            if ids_count == text_count:
                continue
            modality = name
            raise ValueError(
                f"Mismatch in `{modality}` token count between text and `input_ids`. Got ids={ids_count} and text={text_count}. "
                "Likely due to `truncation='max_length'`. Please disable truncation or increase `max_length`."
            )
 def _validate_images_text_input_order(images, text):
    """
--- a/tests/models/aria/test_processor_aria.py
+++ b/tests/models/aria/test_processor_aria.py
@@ -271,3 +271,29 @@ And who is that?<|im_end|>
            return_tensors="np",
        )
        self.assertListEqual(list(out_dict[self.images_input_name].shape), [1, 3, 980, 980])
    def test_special_mm_token_truncation(self):
        """
        Checks that the processor runs with truncation disabled and raises a `ValueError` when called with
        `truncation=True` and `max_length=3`.
        """
        processor = self.get_processor()
        prompts = self.prepare_text_inputs(batch_size=2, modality="image")
        pictures = self.prepare_image_inputs(batch_size=2)
        shared_kwargs = {"text": prompts, "images": pictures, "return_tensors": "pt", "padding": True}

        # Baseline call: with truncation disabled, nothing should be raised.
        processor(**shared_kwargs, truncation=None)

        # Same inputs with truncation to 3 tokens must be rejected.
        with self.assertRaises(ValueError):
            processor(**shared_kwargs, truncation=True, max_length=3)
--- a/tests/models/chameleon/test_processor_chameleon.py
+++ b/tests/models/chameleon/test_processor_chameleon.py
@@ -40,10 +40,37 @@ class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        tokenizer = LlamaTokenizer(vocab_file=SAMPLE_VOCAB)
        tokenizer.pad_token_id = 0
        tokenizer.sep_token_id = 1
        tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
        processor = cls.processor_class(image_processor=image_processor, tokenizer=tokenizer, image_seq_length=2)
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.image_token
    def test_special_mm_token_truncation(self):
        """
        Checks that the processor runs with truncation disabled and raises a `ValueError` when called with
        `truncation=True` and `max_length=20`.
        """
        processor = self.get_processor()
        prompts = self.prepare_text_inputs(batch_size=2, modality="image")
        pictures = self.prepare_image_inputs(batch_size=2)
        shared_kwargs = {"text": prompts, "images": pictures, "return_tensors": "pt", "padding": True}

        # Baseline call: with truncation disabled, nothing should be raised.
        processor(**shared_kwargs, truncation=None)

        # Same inputs with truncation to 20 tokens must be rejected.
        with self.assertRaises(ValueError):
            processor(**shared_kwargs, truncation=True, max_length=20)
    @staticmethod
    def prepare_processor_dict():
        return {"image_seq_length": 2}  # fmt: skip
--- a/tests/models/gemma3/test_processing_gemma3.py
+++ b/tests/models/gemma3/test_processing_gemma3.py
@@ -124,3 +124,28 @@ class Gemma3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        # base image + 4 crops
        self.assertEqual(len(inputs[self.images_input_name]), 5)
        self.assertEqual(len(inputs[self.text_input_name][0]), 67)
    def test_special_mm_token_truncation(self):
        """
        Checks that the processor runs with truncation disabled and raises a `ValueError` when called with
        `truncation=True` and `max_length=5`.
        """
        processor = self.get_processor()
        prompts = self.prepare_text_inputs(batch_size=2, modality="image")
        pictures = self.prepare_image_inputs(batch_size=2)
        shared_kwargs = {"text": prompts, "images": pictures, "return_tensors": "pt", "padding": True}

        # Baseline call: with truncation disabled, nothing should be raised.
        processor(**shared_kwargs, truncation=None)

        # Same inputs with truncation to 5 tokens must be rejected.
        with self.assertRaises(ValueError):
            processor(**shared_kwargs, truncation=True, max_length=5)
--- a/tests/models/idefics2/test_processor_idefics2.py
+++ b/tests/models/idefics2/test_processor_idefics2.py
@@ -66,8 +66,8 @@ class Idefics2ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            )
        )
        cls.bos_token = processor.tokenizer.bos_token
-        cls.image_token = processor.image_token.content
+        cls.image_token = processor.image_token
-        cls.fake_image_token = processor.fake_image_token.content
+        cls.fake_image_token = processor.fake_image_token
        cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token)
        cls.image_token_id = processor.tokenizer.convert_tokens_to_ids(cls.image_token)
--- a/tests/models/idefics3/test_processor_idefics3.py
+++ b/tests/models/idefics3/test_processor_idefics3.py
@@ -60,8 +60,8 @@ class Idefics3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            )
        )
        cls.bos_token = processor.tokenizer.bos_token
-        cls.image_token = processor.image_token.content
+        cls.image_token = processor.image_token
-        cls.fake_image_token = processor.fake_image_token.content
+        cls.fake_image_token = processor.fake_image_token
        cls.global_img_token = processor.global_image_tag
        cls.bos_token_id = processor.tokenizer.convert_tokens_to_ids(cls.bos_token)
--- a/tests/models/llava/test_processor_llava.py
+++ b/tests/models/llava/test_processor_llava.py
@@ -40,6 +40,7 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        image_processor = CLIPImageProcessor(do_center_crop=False)
        tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
        tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
        processor_kwargs = cls.prepare_processor_dict()
        processor = LlavaProcessor(image_processor, tokenizer, **processor_kwargs)
        processor.save_pretrained(cls.tmpdirname)
@@ -79,3 +80,29 @@ class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            processor = LlavaProcessor.from_pretrained(checkpoint)
            tokenizer = AutoTokenizer.from_pretrained(checkpoint)
            self.assertEqual(processor.tokenizer.__class__, tokenizer.__class__)
    def test_special_mm_token_truncation(self):
        """
        Checks that the `llava-hf/llava-1.5-7b-hf` processor runs with truncation disabled and raises a
        `ValueError` when called with `truncation=True` and `max_length=5`.
        """
        processor = LlavaProcessor.from_pretrained("llava-hf/llava-1.5-7b-hf")
        prompts = self.prepare_text_inputs(batch_size=2, modality="image")
        pictures = self.prepare_image_inputs(batch_size=2)
        shared_kwargs = {"text": prompts, "images": pictures, "return_tensors": "pt", "padding": True}

        # Baseline call: with truncation disabled, nothing should be raised.
        processor(**shared_kwargs, truncation=None)

        # Same inputs with truncation to 5 tokens must be rejected.
        with self.assertRaises(ValueError):
            processor(**shared_kwargs, truncation=True, max_length=5)
--- a/tests/models/llava_next/test_processor_llava_next.py
+++ b/tests/models/llava_next/test_processor_llava_next.py
@@ -40,6 +40,7 @@ class LlavaNextProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        image_processor = LlavaNextImageProcessor()
        tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
        tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
        processor_kwargs = cls.prepare_processor_dict()
        processor = LlavaNextProcessor(image_processor, tokenizer, **processor_kwargs)
        processor.save_pretrained(cls.tmpdirname)
--- a/tests/models/llava_next_video/test_processor_llava_next_video.py
+++ b/tests/models/llava_next_video/test_processor_llava_next_video.py
@@ -41,6 +41,7 @@ class LlavaNextVideoProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        image_processor = LlavaNextImageProcessor()
        video_processor = LlavaNextVideoImageProcessor()
        tokenizer = LlamaTokenizerFast.from_pretrained("llava-hf/LLaVA-NeXT-Video-7B-hf")
        tokenizer.add_special_tokens({"additional_special_tokens": ["<image>", "<video>"]})
        processor_kwargs = cls.prepare_processor_dict()
        processor = LlavaNextVideoProcessor(
--- a/tests/models/llava_onevision/test_processor_llava_onevision.py
+++ b/tests/models/llava_onevision/test_processor_llava_onevision.py
@@ -45,6 +45,7 @@ class LlavaOnevisionProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        image_processor = LlavaOnevisionImageProcessor()
        video_processor = LlavaOnevisionVideoProcessor()
        tokenizer = Qwen2TokenizerFast.from_pretrained("Qwen/Qwen2-0.5B-Instruct")
        tokenizer.add_special_tokens({"additional_special_tokens": ["<image>", "<video>"]})
        processor_kwargs = cls.prepare_processor_dict()
        processor = LlavaOnevisionProcessor(
--- a/tests/models/mistral3/test_processor_mistral3.py
+++ b/tests/models/mistral3/test_processor_mistral3.py
@@ -290,3 +290,29 @@ class Mistral3ProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        inputs_image = processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
        self.assertIn("input_ids", inputs_image)
        self.assertTrue(len(inputs_image["input_ids"]) == 5)
    def test_special_mm_token_truncation(self):
        """
        Checks that the processor runs with truncation disabled and raises a `ValueError` when called with
        `truncation=True` and `max_length=3`.
        """
        processor = self.get_processor()
        prompts = self.prepare_text_inputs(batch_size=2, modality="image")
        pictures = self.prepare_image_inputs(batch_size=2)
        shared_kwargs = {"text": prompts, "images": pictures, "return_tensors": "pt", "padding": True}

        # Baseline call: with truncation disabled, nothing should be raised.
        processor(**shared_kwargs, truncation=None)

        # Same inputs with truncation to 3 tokens must be rejected.
        with self.assertRaises(ValueError):
            processor(**shared_kwargs, truncation=True, max_length=3)
--- a/tests/models/mllama/test_processor_mllama.py
+++ b/tests/models/mllama/test_processor_mllama.py
@@ -360,3 +360,29 @@ class MllamaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            len(inputs[self.text_input_name][0]) == len(inputs[self.text_input_name][1])
            and len(inputs[self.text_input_name][1]) < 76
        )
    def test_special_mm_token_truncation(self):
        """
        Checks that the processor runs with truncation disabled and raises a `ValueError` when called with
        `truncation=True` and `max_length=3`. Images are passed as one single-image list per sample.
        """
        processor = self.get_processor()
        prompts = self.prepare_text_inputs(batch_size=2, modality="image")
        flat_images = self.prepare_image_inputs(batch_size=2)
        # Re-wrap the two images so that each sample gets its own one-element list.
        nested_images = [[flat_images[0]], [flat_images[1]]]
        shared_kwargs = {"text": prompts, "images": nested_images, "return_tensors": "pt", "padding": True}

        # Baseline call: with truncation disabled, nothing should be raised.
        processor(**shared_kwargs, truncation=None)

        # Same inputs with truncation to 3 tokens must be rejected.
        with self.assertRaises(ValueError):
            processor(**shared_kwargs, truncation=True, max_length=3)
--- a/tests/models/paligemma/test_processor_paligemma.py
+++ b/tests/models/paligemma/test_processor_paligemma.py
@@ -39,6 +39,7 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        image_processor = SiglipImageProcessor.from_pretrained("google/siglip-so400m-patch14-384")
        image_processor.image_seq_length = 0  # TODO: raushan fix me in #37342
        tokenizer = GemmaTokenizer(SAMPLE_VOCAB, keep_accents=True)
        tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
        processor = PaliGemmaProcessor(image_processor=image_processor, tokenizer=tokenizer)
        processor.save_pretrained(cls.tmpdirname)
        cls.image_token = processor.image_token
@@ -59,7 +60,7 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        inputs = processor(
            text=input_str, images=image_input, return_tensors="pt", max_length=112, padding="max_length"
        )
-        self.assertEqual(len(inputs["input_ids"][0]), 112 + 14)
+        self.assertEqual(len(inputs["input_ids"][0]), 112)
    def test_text_with_image_tokens(self):
        image_processor = self.get_component("image_processor")
--- a/tests/models/qwen2_vl/test_processor_qwen2_vl.py
+++ b/tests/models/qwen2_vl/test_processor_qwen2_vl.py
@@ -397,3 +397,29 @@ class Qwen2VLProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        self.assertEqual(inputs[self.images_input_name].shape[0], 100)
        inputs = processor(text=input_str, images=image_input, max_pixels=56 * 56 * 4, return_tensors="pt")
        self.assertEqual(inputs[self.images_input_name].shape[0], 612)
    def test_special_mm_token_truncation(self):
        """
        Checks that the processor runs with truncation disabled and raises a `ValueError` when called with
        `truncation=True` and `max_length=20`.
        """
        processor = self.get_processor()
        prompts = self.prepare_text_inputs(batch_size=2, modality="image")
        pictures = self.prepare_image_inputs(batch_size=2)
        shared_kwargs = {"text": prompts, "images": pictures, "return_tensors": "pt", "padding": True}

        # Baseline call: with truncation disabled, nothing should be raised.
        processor(**shared_kwargs, truncation=None)

        # Same inputs with truncation to 20 tokens must be rejected.
        with self.assertRaises(ValueError):
            processor(**shared_kwargs, truncation=True, max_length=20)
--- a/tests/models/smolvlm/test_processor_smolvlm.py
+++ b/tests/models/smolvlm/test_processor_smolvlm.py
@@ -435,7 +435,8 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        image_processor = self.get_component("image_processor")
        tokenizer = self.get_component("tokenizer")
-        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
+        processor_kwargs = self.prepare_processor_dict()
        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor, **processor_kwargs)
        self.skip_processor_without_typed_kwargs(processor)
        input_str = self.prepare_text_inputs(batch_size=2, modality="image")
@@ -445,14 +446,14 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            text=input_str,
            images=image_input,
            return_tensors="pt",
-            padding="longest",
+            padding="max_length",
            max_length=76,
            truncation=True,
-            max_image_size={"longest_edge": 30},
+            max_image_size={"longest_edge": 300},
        )
        self.assertEqual(inputs["pixel_values"].shape[2], 3)
-        self.assertEqual(inputs["pixel_values"].shape[3], 30)
+        self.assertEqual(inputs["pixel_values"].shape[3], 300)
        self.assertEqual(len(inputs["input_ids"][0]), 76)
    @require_torch
@@ -529,3 +530,29 @@ class SmolVLMProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        with self.assertRaises(ValueError) as context:
            processor(text=texts, images=None)
        self.assertTrue("tokens in the text but no images/videos were passed" in str(context.exception))
    def test_special_mm_token_truncation(self):
        """
        Checks that the processor runs with truncation disabled and raises a `ValueError` when called with
        `truncation=True` and `max_length=20`.
        """
        processor = self.get_processor()
        input_str = self.prepare_text_inputs(batch_size=2, modality="image")
        image_input = self.prepare_image_inputs(batch_size=2)
        # Re-wrap the two images so that each sample gets its own one-element list.
        image_input = [[image_input[0]], [image_input[1]]]
        # Baseline call: with truncation disabled, nothing should be raised.
        _ = processor(
            text=input_str,
            images=image_input,
            return_tensors="pt",
            truncation=None,
            padding=True,
        )
        # Same inputs with truncation to 20 tokens must be rejected.
        with self.assertRaises(ValueError):
            _ = processor(
                text=input_str,
                images=image_input,
                return_tensors="pt",
                truncation=True,
                padding=True,
                max_length=20,
            )