From be37d34f44ff1bc928e59ffb8a30adecab8835a8 Mon Sep 17 00:00:00 2001
From: Yoni Gozlan <74535834+yonigozlan@users.noreply.github.com>
Date: Tue, 25 Mar 2025 17:32:17 -0400
Subject: [PATCH] =?UTF-8?q?=F0=9F=9A=A8Deprecate=20legacy=20argument=20for?=
 =?UTF-8?q?=20image-text-to-text=20models=20and=20adopt=20new=20behavior?=
 =?UTF-8?q?=20by=20default=20(#36307)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* deprecate legacy argument and adopt new behavior by default

* revert the modification to the GIT processor
---
 src/transformers/models/donut/processing_donut.py   | 13 +------------
 src/transformers/models/git/processing_git.py       | 12 ------------
 .../models/pix2struct/processing_pix2struct.py      | 11 +----------
 3 files changed, 2 insertions(+), 34 deletions(-)

diff --git a/src/transformers/models/donut/processing_donut.py b/src/transformers/models/donut/processing_donut.py
index 04ddf901c6..689aa5122f 100644
--- a/src/transformers/models/donut/processing_donut.py
+++ b/src/transformers/models/donut/processing_donut.py
@@ -88,17 +88,6 @@ class DonutProcessor(ProcessorMixin):
         [`~DonutProcessor.as_target_processor`] this method forwards all its arguments to DonutTokenizer's
         [`~DonutTokenizer.__call__`]. Please refer to the docstring of the above two methods for more information.
         """
-        # For backward compatibility
-        legacy = kwargs.pop("legacy", True)
-        if legacy:
-            # With `add_special_tokens=True`, the performance of donut are degraded when working with both images and text.
-            logger.warning_once(
-                "Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. "
-                "In the new behavior, if both images and text are provided, the default value of `add_special_tokens` "
-                "will be changed to `False` when calling the tokenizer if `add_special_tokens` is unset. "
-                "To test the new behavior, set `legacy=False`as a processor call argument."
-            )
-
         if self._in_target_context_manager:
             return self.current_processor(images, text, **kwargs)
 
@@ -114,7 +103,7 @@ class DonutProcessor(ProcessorMixin):
         if images is not None:
             inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
         if text is not None:
-            if not legacy and images is not None:
+            if images is not None:
                 output_kwargs["text_kwargs"].setdefault("add_special_tokens", False)
             encodings = self.tokenizer(text, **output_kwargs["text_kwargs"])
 
diff --git a/src/transformers/models/git/processing_git.py b/src/transformers/models/git/processing_git.py
index 2f1b35cb7c..98cc3b83cf 100644
--- a/src/transformers/models/git/processing_git.py
+++ b/src/transformers/models/git/processing_git.py
@@ -95,15 +95,6 @@ class GitProcessor(ProcessorMixin):
               `None`).
             - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
         """
-        legacy = kwargs.pop("legacy", True)
-        if legacy:
-            logger.warning_once(
-                "Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. "
-                "In the new behavior, if both images and text are provided, the last token (EOS token) "
-                "of the input_ids and attention_mask tensors will be removed. "
-                "To test the new behavior, set `legacy=False`as a processor call argument."
-            )
-
         if text is None and images is None:
             raise ValueError("You have to specify either text or images. Both cannot be none.")
 
@@ -123,9 +114,6 @@ class GitProcessor(ProcessorMixin):
         if images is not None:
             image_features = self.image_processor(images, **output_kwargs["images_kwargs"])
             data.update(image_features)
-            if not legacy:
-                data["input_ids"] = data["input_ids"][:, :-1]
-                data["attention_mask"] = data["attention_mask"][:, :-1]
 
         return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"].get("return_tensors"))
 
diff --git a/src/transformers/models/pix2struct/processing_pix2struct.py b/src/transformers/models/pix2struct/processing_pix2struct.py
index ac9802dac8..f9b1fcc440 100644
--- a/src/transformers/models/pix2struct/processing_pix2struct.py
+++ b/src/transformers/models/pix2struct/processing_pix2struct.py
@@ -89,15 +89,6 @@ class Pix2StructProcessor(ProcessorMixin):
 
         Please refer to the docstring of the above two methods for more information.
         """
-        legacy = kwargs.pop("legacy", True)
-        if legacy:
-            logger.warning_once(
-                "Legacy behavior is being used. The current behavior will be deprecated in version 5.0.0. "
-                "In the new behavior, If both images and text are provided, image_processor is not a VQA processor, and `add_special_tokens` is unset, "
-                "the default value of `add_special_tokens` will be changed to `False` when calling the tokenizer. "
-                "To test the new behavior, set `legacy=False`as a processor call argument."
-            )
-
         if images is None and text is None:
             raise ValueError("You have to specify either images or text.")
 
@@ -126,7 +117,7 @@ class Pix2StructProcessor(ProcessorMixin):
 
         if text is not None and not self.image_processor.is_vqa:
             output_kwargs["text_kwargs"]["add_special_tokens"] = (
-                add_special_tokens if add_special_tokens is not None else legacy
+                add_special_tokens if add_special_tokens is not None else False
             )
             text_encoding = self.tokenizer(text=text, **output_kwargs["text_kwargs"])