Process inputs directly in apply_chat_template in image-text-to-text pipeline (#35616)

* tokenize inputs directly in apply_chat_template

* refactor processing

* revert changes processing llava

* Update docs

* fix issue with str being iterable

* add test chat text only

* change function name
This commit is contained in:
Yoni Gozlan
2025-04-23 13:31:33 -04:00
committed by GitHub
parent 80ea2c05c2
commit 5cd6b64059
3 changed files with 186 additions and 54 deletions

View File

@@ -66,6 +66,78 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
],
)
@require_torch
def test_small_model_pt_token_text_only(self):
    """Exercise the image-text-to-text pipeline with text-only inputs.

    Two input shapes are covered, neither of which passes an image:
    a plain prompt string, and a batch of two single-turn chats.
    """

    def user_turn(prompt_text):
        # Build a fresh user message on every call so the expected values
        # never share objects with the inputs handed to the pipeline.
        return {"role": "user", "content": [{"type": "text", "text": prompt_text}]}

    generator = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")

    # Case 1: plain string prompt; the expected generated text starts with the prompt.
    prompt = "What is the capital of France? Assistant:"
    plain_outputs = generator(text=prompt)
    expected_plain = [
        {
            "input_text": "What is the capital of France? Assistant:",
            "generated_text": "What is the capital of France? Assistant: The capital of France is Paris.",
        }
    ]
    self.assertEqual(plain_outputs, expected_plain)

    # Case 2: a batch of two chats, each made of a single text-only user turn.
    poem_request = "Write a poem on Hugging Face, the company"
    capital_question = "What is the capital of France?"
    chats = [[user_turn(poem_request)], [user_turn(capital_question)]]
    chat_outputs = generator(text=chats)

    # Each result echoes the chat as "input_text"; "generated_text" holds the
    # same chat followed by one assistant turn.
    expected_poem_result = {
        "input_text": [user_turn(poem_request)],
        "generated_text": [
            user_turn(poem_request),
            {
                "role": "assistant",
                "content": "Hugging Face, a company of minds\nWith tools and services that make our lives easier\nFrom",
            },
        ],
    }
    expected_capital_result = {
        "input_text": [user_turn(capital_question)],
        "generated_text": [
            user_turn(capital_question),
            {"role": "assistant", "content": "Paris"},
        ],
    }
    self.assertEqual(chat_outputs, [[expected_poem_result], [expected_capital_result]])
@require_torch
def test_small_model_pt_token(self):
pipe = pipeline("image-text-to-text", model="llava-hf/llava-interleave-qwen-0.5b-hf")
@@ -124,7 +196,7 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
],
}
]
outputs = pipe([image_ny, image_chicago], text=messages, return_full_text=False, max_new_tokens=10)
outputs = pipe([image_ny, image_chicago], text=messages, return_full_text=True, max_new_tokens=10)
self.assertEqual(
outputs,
[
@@ -134,12 +206,37 @@ class ImageTextToTextPipelineTests(unittest.TestCase):
"role": "user",
"content": [
{"type": "text", "text": "Whats the difference between these two images?"},
{"type": "image"},
{"type": "image"},
{
"type": "image",
"image": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
},
{
"type": "image",
"image": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
},
],
}
],
"generated_text": "The first image shows a statue of Liberty in the",
"generated_text": [
{
"role": "user",
"content": [
{"type": "text", "text": "Whats the difference between these two images?"},
{
"type": "image",
"image": "https://cdn.britannica.com/61/93061-050-99147DCE/Statue-of-Liberty-Island-New-York-Bay.jpg",
},
{
"type": "image",
"image": "https://cdn.britannica.com/59/94459-050-DBA42467/Skyline-Chicago.jpg",
},
],
},
{
"role": "assistant",
"content": "The first image shows a statue of Liberty in the",
},
],
}
],
)