From 21741e8c7efb13c9263985be1a2cd04c132bcf99 Mon Sep 17 00:00:00 2001
From: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
Date: Tue, 16 May 2023 14:49:24 +0200
Subject: [PATCH] Update `test_batched_inference_image_captioning_conditioned`
 (#23391)

* fix

* fix

* fix test + add more docs

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: younesbelkada <younesbelkada@gmail.com>
---
 docs/source/en/model_doc/pix2struct.mdx             | 2 ++
 tests/models/pix2struct/test_modeling_pix2struct.py | 9 ++++++---
 2 files changed, 8 insertions(+), 3 deletions(-)

diff --git a/docs/source/en/model_doc/pix2struct.mdx b/docs/source/en/model_doc/pix2struct.mdx
index c6d3136285..f4ead88f5c 100644
--- a/docs/source/en/model_doc/pix2struct.mdx
+++ b/docs/source/en/model_doc/pix2struct.mdx
@@ -25,6 +25,8 @@ Tips:
 Pix2Struct has been fine tuned on a variety of tasks and datasets, ranging from image captioning, visual question answering (VQA) over different inputs (books, charts, science diagrams), captioning UI components etc. The full list can be found in Table 1 of the paper.
 We therefore advise you to use these models for the tasks they have been fine tuned on. For instance, if you want to use Pix2Struct for UI captioning, you should use the model fine tuned on the UI dataset. If you want to use Pix2Struct for image captioning, you should use the model fine tuned on the natural images captioning dataset and so on.
 
+If you want to use the model for conditional image captioning (i.e. generating a caption that continues a given text prompt), make sure to call the processor with `add_special_tokens=False`.
+
 This model was contributed by [ybelkada](https://huggingface.co/ybelkada).
 The original code can be found [here](https://github.com/google-research/pix2struct).
 
diff --git a/tests/models/pix2struct/test_modeling_pix2struct.py b/tests/models/pix2struct/test_modeling_pix2struct.py
index 42ee3c2b4c..4dbd7e649f 100644
--- a/tests/models/pix2struct/test_modeling_pix2struct.py
+++ b/tests/models/pix2struct/test_modeling_pix2struct.py
@@ -749,17 +749,20 @@ class Pix2StructIntegrationTest(unittest.TestCase):
         texts = ["A picture of", "An photography of"]
 
         # image only
-        inputs = processor(images=[image_1, image_2], text=texts, return_tensors="pt").to(torch_device)
+        inputs = processor(images=[image_1, image_2], text=texts, return_tensors="pt", add_special_tokens=False).to(
+            torch_device
+        )
 
         predictions = model.generate(**inputs)
 
         self.assertEqual(
-            processor.decode(predictions[0], skip_special_tokens=True), "A picture of a stop sign that says yes."
+            processor.decode(predictions[0], skip_special_tokens=True),
+            "A picture of a stop sign with a red stop sign on it.",
         )
 
         self.assertEqual(
             processor.decode(predictions[1], skip_special_tokens=True),
-            "An photography of the Temple Bar and a few other places.",
+            "An photography of the Temple Bar and the Temple Bar.",
         )
 
     def test_vqa_model(self):