Add support for post-processing kwargs in image-text-to-text pipeline (#35374)

* fix error and improve pipeline
* add processing_kwargs to apply_chat_template
* change default post_process kwarg to args
* Fix slow tests
* fix copies
2025-02-18 17:43:36 -05:00
parent 9b479a245b
commit 9f51dc2535
8 changed files with 91 additions and 41 deletions
--- a/tests/pipelines/test_pipelines_image_text_to_text.py
+++ b/tests/pipelines/test_pipelines_image_text_to_text.py
@@ -124,7 +124,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
                ],
            }
        ]
-        outputs = pipe([image_ny, image_chicago], text=messages)
+        outputs = pipe([image_ny, image_chicago], text=messages, return_full_text=False, max_new_tokens=10)
        self.assertEqual(
            outputs,
            [
@@ -139,20 +139,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
                            ],
                        }
                    ],
-                    "generated_text": [
-                        {
-                            "role": "user",
-                            "content": [
-                                {"type": "text", "text": "What’s the difference between these two images?"},
-                                {"type": "image"},
-                                {"type": "image"},
-                            ],
-                        },
-                        {
-                            "role": "assistant",
-                            "content": "The first image shows a statue of the Statue of Liberty in the foreground, while the second image shows",
-                        },
-                    ],
+                    "generated_text": "The first image shows a statue of Liberty in the",
                }
            ],
        )
@@ -179,7 +166,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
                ],
            },
        ]
-        outputs = pipe(text=messages)
+        outputs = pipe(text=messages, max_new_tokens=10)
        self.assertEqual(
            outputs,
            [
@@ -213,7 +200,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
                            "content": [
                                {
                                    "type": "text",
-                                    "text": "There is a dog and a person in the image. The dog is sitting on the sand, and the person is sitting on",
+                                    "text": "There is a dog and a person in the image. The dog is sitting",
                                }
                            ],
                        },
@@ -238,7 +225,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
                ],
            }
        ]
-        outputs = pipe(text=messages, return_full_text=False)
+        outputs = pipe(text=messages, return_full_text=False, max_new_tokens=10)
        self.assertEqual(
            outputs,
            [
@@ -255,7 +242,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
                            ],
                        }
                    ],
-                    "generated_text": "In the image, a woman is sitting on the sandy beach, her legs crossed in a relaxed manner",
+                    "generated_text": "In the image, a woman is sitting on the",
                }
            ],
        )
@@ -263,7 +250,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
    @slow
    @require_torch
    def test_model_pt_chat_template_image_url(self):
-        pipe = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
+        pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
        messages = [
            {
                "role": "user",
@@ -279,7 +266,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
            }
        ]
        outputs = pipe(text=messages, return_full_text=False, max_new_tokens=10)[0]["generated_text"]
-        self.assertEqual(outputs, "The image captures the iconic Statue of Liberty, a")
+        self.assertEqual(outputs, "A statue of liberty in the foreground of a city")

    @slow
    @require_torch