Add support for post-processing kwargs in image-text-to-text pipeline (#35374)
* fix error and improve pipeline
* add processing_kwargs to apply_chat_template
* change default post_process kwarg to args
* Fix slow tests
* fix copies
This commit is contained in:
@@ -124,7 +124,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
|
||||
],
|
||||
}
|
||||
]
|
||||
outputs = pipe([image_ny, image_chicago], text=messages)
|
||||
outputs = pipe([image_ny, image_chicago], text=messages, return_full_text=False, max_new_tokens=10)
|
||||
self.assertEqual(
|
||||
outputs,
|
||||
[
|
||||
@@ -139,20 +139,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
|
||||
],
|
||||
}
|
||||
],
|
||||
"generated_text": [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "What’s the difference between these two images?"},
|
||||
{"type": "image"},
|
||||
{"type": "image"},
|
||||
],
|
||||
},
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "The first image shows a statue of the Statue of Liberty in the foreground, while the second image shows",
|
||||
},
|
||||
],
|
||||
"generated_text": "The first image shows a statue of Liberty in the",
|
||||
}
|
||||
],
|
||||
)
|
||||
@@ -179,7 +166,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
|
||||
],
|
||||
},
|
||||
]
|
||||
outputs = pipe(text=messages)
|
||||
outputs = pipe(text=messages, max_new_tokens=10)
|
||||
self.assertEqual(
|
||||
outputs,
|
||||
[
|
||||
@@ -213,7 +200,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "There is a dog and a person in the image. The dog is sitting on the sand, and the person is sitting on",
|
||||
"text": "There is a dog and a person in the image. The dog is sitting",
|
||||
}
|
||||
],
|
||||
},
|
||||
@@ -238,7 +225,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
|
||||
],
|
||||
}
|
||||
]
|
||||
outputs = pipe(text=messages, return_full_text=False)
|
||||
outputs = pipe(text=messages, return_full_text=False, max_new_tokens=10)
|
||||
self.assertEqual(
|
||||
outputs,
|
||||
[
|
||||
@@ -255,7 +242,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
|
||||
],
|
||||
}
|
||||
],
|
||||
"generated_text": "In the image, a woman is sitting on the sandy beach, her legs crossed in a relaxed manner",
|
||||
"generated_text": "In the image, a woman is sitting on the",
|
||||
}
|
||||
],
|
||||
)
|
||||
@@ -263,7 +250,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
|
||||
@slow
|
||||
@require_torch
|
||||
def test_model_pt_chat_template_image_url(self):
|
||||
pipe = pipeline("image-text-to-text", model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
|
||||
pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
@@ -279,7 +266,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
|
||||
}
|
||||
]
|
||||
outputs = pipe(text=messages, return_full_text=False, max_new_tokens=10)[0]["generated_text"]
|
||||
self.assertEqual(outputs, "The image captures the iconic Statue of Liberty, a")
|
||||
self.assertEqual(outputs, "A statue of liberty in the foreground of a city")
|
||||
|
||||
@slow
|
||||
@require_torch
|
||||
|
||||
Reference in New Issue
Block a user