Uniformize kwargs for Pixtral processor (#33521)
* add uniformized pixtral and kwargs * update doc * fix _validate_images_text_input_order * nit
This commit is contained in:
@@ -11,14 +11,21 @@
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
import shutil
|
||||
import tempfile
|
||||
import unittest
|
||||
|
||||
import requests
|
||||
import torch
|
||||
|
||||
from transformers.testing_utils import require_vision
|
||||
from transformers.testing_utils import (
|
||||
require_torch,
|
||||
require_vision,
|
||||
)
|
||||
from transformers.utils import is_vision_available
|
||||
|
||||
from ...test_processing_common import ProcessorTesterMixin
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
from PIL import Image
|
||||
@@ -27,7 +34,7 @@ if is_vision_available():
|
||||
|
||||
|
||||
@require_vision
|
||||
class PixtralProcessorTest(unittest.TestCase):
|
||||
class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||
processor_class = PixtralProcessor
|
||||
|
||||
@classmethod
|
||||
@@ -40,15 +47,20 @@ class PixtralProcessorTest(unittest.TestCase):
|
||||
cls.image_2 = Image.open(requests.get(cls.url_2, stream=True).raw)
|
||||
|
||||
def setUp(self):
|
||||
super().setUp()
|
||||
self.tmpdirname = tempfile.mkdtemp()
|
||||
|
||||
# FIXME - just load the processor directly from the checkpoint
|
||||
tokenizer = AutoTokenizer.from_pretrained("hf-internal-testing/pixtral-12b")
|
||||
image_processor = PixtralImageProcessor()
|
||||
self.processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
processor = PixtralProcessor(tokenizer=tokenizer, image_processor=image_processor)
|
||||
processor.save_pretrained(self.tmpdirname)
|
||||
|
||||
def tearDown(self):
|
||||
shutil.rmtree(self.tmpdirname)
|
||||
|
||||
@unittest.skip("No chat template was set for this model (yet)")
|
||||
def test_chat_template(self):
|
||||
processor = self.processor_class.from_pretrained(self.tmpdirname)
|
||||
expected_prompt = "USER: [IMG]\nWhat is shown in this image? ASSISTANT:"
|
||||
|
||||
messages = [
|
||||
@@ -60,11 +72,12 @@ class PixtralProcessorTest(unittest.TestCase):
|
||||
],
|
||||
},
|
||||
]
|
||||
formatted_prompt = self.processor.apply_chat_template(messages, add_generation_prompt=True)
|
||||
formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
|
||||
self.assertEqual(expected_prompt, formatted_prompt)
|
||||
|
||||
@unittest.skip("No chat template was set for this model (yet)")
|
||||
def test_image_token_filling(self):
|
||||
processor = self.processor_class.from_pretrained(self.tmpdirname)
|
||||
# Important to check with non square image
|
||||
image = torch.randint(0, 2, (3, 500, 316))
|
||||
expected_image_tokens = 1526
|
||||
@@ -79,8 +92,8 @@ class PixtralProcessorTest(unittest.TestCase):
|
||||
],
|
||||
},
|
||||
]
|
||||
inputs = self.processor(
|
||||
text=[self.processor.apply_chat_template(messages)],
|
||||
inputs = processor(
|
||||
text=[processor.apply_chat_template(messages)],
|
||||
images=[image],
|
||||
return_tensors="pt",
|
||||
)
|
||||
@@ -88,14 +101,15 @@ class PixtralProcessorTest(unittest.TestCase):
|
||||
self.assertEqual(expected_image_tokens, image_tokens)
|
||||
|
||||
def test_processor_with_single_image(self):
|
||||
processor = self.processor_class.from_pretrained(self.tmpdirname)
|
||||
prompt_string = "USER: [IMG]\nWhat's the content of the image? ASSISTANT:"
|
||||
|
||||
# Make small for checking image token expansion
|
||||
self.processor.image_processor.size = {"longest_edge": 30}
|
||||
self.processor.image_processor.patch_size = {"height": 2, "width": 2}
|
||||
processor.image_processor.size = {"longest_edge": 30}
|
||||
processor.image_processor.patch_size = {"height": 2, "width": 2}
|
||||
|
||||
# Test passing in an image
|
||||
inputs_image = self.processor(text=prompt_string, images=self.image_0, return_tensors="pt")
|
||||
inputs_image = processor(text=prompt_string, images=self.image_0, return_tensors="pt")
|
||||
self.assertIn("input_ids", inputs_image)
|
||||
self.assertTrue(len(inputs_image["input_ids"]) == 1)
|
||||
self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
|
||||
@@ -115,7 +129,7 @@ class PixtralProcessorTest(unittest.TestCase):
|
||||
# fmt: on
|
||||
|
||||
# Test passing in a url
|
||||
inputs_url = self.processor(text=prompt_string, images=self.url_0, return_tensors="pt")
|
||||
inputs_url = processor(text=prompt_string, images=self.url_0, return_tensors="pt")
|
||||
self.assertIn("input_ids", inputs_url)
|
||||
self.assertTrue(len(inputs_url["input_ids"]) == 1)
|
||||
self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
|
||||
@@ -135,14 +149,15 @@ class PixtralProcessorTest(unittest.TestCase):
|
||||
# fmt: on
|
||||
|
||||
def test_processor_with_multiple_images_single_list(self):
|
||||
processor = self.processor_class.from_pretrained(self.tmpdirname)
|
||||
prompt_string = "USER: [IMG][IMG]\nWhat's the difference between these two images? ASSISTANT:"
|
||||
|
||||
# Make small for checking image token expansion
|
||||
self.processor.image_processor.size = {"longest_edge": 30}
|
||||
self.processor.image_processor.patch_size = {"height": 2, "width": 2}
|
||||
processor.image_processor.size = {"longest_edge": 30}
|
||||
processor.image_processor.patch_size = {"height": 2, "width": 2}
|
||||
|
||||
# Test passing in an image
|
||||
inputs_image = self.processor(text=prompt_string, images=[self.image_0, self.image_1], return_tensors="pt")
|
||||
inputs_image = processor(text=prompt_string, images=[self.image_0, self.image_1], return_tensors="pt")
|
||||
self.assertIn("input_ids", inputs_image)
|
||||
self.assertTrue(len(inputs_image["input_ids"]) == 1)
|
||||
self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
|
||||
@@ -162,7 +177,7 @@ class PixtralProcessorTest(unittest.TestCase):
|
||||
# fmt: on
|
||||
|
||||
# Test passing in a url
|
||||
inputs_url = self.processor(text=prompt_string, images=[self.url_0, self.url_1], return_tensors="pt")
|
||||
inputs_url = processor(text=prompt_string, images=[self.url_0, self.url_1], return_tensors="pt")
|
||||
self.assertIn("input_ids", inputs_url)
|
||||
self.assertTrue(len(inputs_url["input_ids"]) == 1)
|
||||
self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
|
||||
@@ -181,19 +196,20 @@ class PixtralProcessorTest(unittest.TestCase):
|
||||
# fmt: on
|
||||
|
||||
def test_processor_with_multiple_images_multiple_lists(self):
|
||||
processor = self.processor_class.from_pretrained(self.tmpdirname)
|
||||
prompt_string = [
|
||||
"USER: [IMG][IMG]\nWhat's the difference between these two images? ASSISTANT:",
|
||||
"USER: [IMG]\nWhat's the content of the image? ASSISTANT:",
|
||||
]
|
||||
self.processor.tokenizer.pad_token = "</s>"
|
||||
processor.tokenizer.pad_token = "</s>"
|
||||
image_inputs = [[self.image_0, self.image_1], [self.image_2]]
|
||||
|
||||
# Make small for checking image token expansion
|
||||
self.processor.image_processor.size = {"longest_edge": 30}
|
||||
self.processor.image_processor.patch_size = {"height": 2, "width": 2}
|
||||
processor.image_processor.size = {"longest_edge": 30}
|
||||
processor.image_processor.patch_size = {"height": 2, "width": 2}
|
||||
|
||||
# Test passing in an image
|
||||
inputs_image = self.processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
|
||||
inputs_image = processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
|
||||
self.assertIn("input_ids", inputs_image)
|
||||
self.assertTrue(len(inputs_image["input_ids"]) == 2)
|
||||
self.assertIsInstance(inputs_image["input_ids"], torch.Tensor)
|
||||
@@ -213,7 +229,7 @@ class PixtralProcessorTest(unittest.TestCase):
|
||||
# fmt: on
|
||||
|
||||
# Test passing in a url
|
||||
inputs_url = self.processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
|
||||
inputs_url = processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
|
||||
self.assertIn("input_ids", inputs_url)
|
||||
self.assertTrue(len(inputs_url["input_ids"]) == 2)
|
||||
self.assertIsInstance(inputs_url["input_ids"], torch.Tensor)
|
||||
@@ -231,3 +247,145 @@ class PixtralProcessorTest(unittest.TestCase):
|
||||
[21510, 1058, 1032, 10, 10, 12, 10, 10, 13, 10, 10, 12, 10, 10, 13, 1010, 7493, 1681, 1278, 6592, 2396, 2576, 2295, 8061, 1063, 1349, 4290, 16002, 41150, 1058]
|
||||
)
|
||||
# fmt: on
|
||||
|
||||
# Override all tests requiring shape as returning tensor batches is not supported by PixtralProcessor
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_image_processor_defaults_preserved_by_image_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor", size={"height": 240, "width": 240})
|
||||
tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = "lower newer"
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
inputs = processor(text=input_str, images=image_input)
|
||||
# Added dimension by pixtral image processor
|
||||
self.assertEqual(len(inputs["pixel_values"][0][0][0][0]), 240)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_kwargs_overrides_default_image_processor_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor", size={"height": 400, "width": 400})
|
||||
tokenizer = self.get_component("tokenizer", max_length=117, padding="max_length")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = "lower newer"
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, size={"height": 240, "width": 240})
|
||||
self.assertEqual(len(inputs["pixel_values"][0][0][0][0]), 240)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_structured_kwargs_nested(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = "lower newer"
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
# Define the kwargs for each modality
|
||||
all_kwargs = {
|
||||
"common_kwargs": {"return_tensors": "pt"},
|
||||
"images_kwargs": {"size": {"height": 240, "width": 240}},
|
||||
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
||||
}
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, **all_kwargs)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
self.assertEqual(inputs["pixel_values"][0][0].shape[-1], 240)
|
||||
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 76)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_structured_kwargs_nested_from_dict(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
input_str = "lower newer"
|
||||
image_input = self.prepare_image_inputs()
|
||||
|
||||
# Define the kwargs for each modality
|
||||
all_kwargs = {
|
||||
"common_kwargs": {"return_tensors": "pt"},
|
||||
"images_kwargs": {"size": {"height": 240, "width": 240}},
|
||||
"text_kwargs": {"padding": "max_length", "max_length": 76},
|
||||
}
|
||||
|
||||
inputs = processor(text=input_str, images=image_input, **all_kwargs)
|
||||
self.assertEqual(inputs["pixel_values"][0][0].shape[-1], 240)
|
||||
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 76)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_unstructured_kwargs(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = "lower newer"
|
||||
image_input = self.prepare_image_inputs()
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
images=image_input,
|
||||
return_tensors="pt",
|
||||
size={"height": 240, "width": 240},
|
||||
padding="max_length",
|
||||
max_length=76,
|
||||
)
|
||||
|
||||
self.assertEqual(inputs["pixel_values"][0][0].shape[-1], 240)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 76)
|
||||
|
||||
@require_torch
|
||||
@require_vision
|
||||
def test_unstructured_kwargs_batched(self):
|
||||
if "image_processor" not in self.processor_class.attributes:
|
||||
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
|
||||
image_processor = self.get_component("image_processor")
|
||||
tokenizer = self.get_component("tokenizer")
|
||||
|
||||
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
|
||||
self.skip_processor_without_typed_kwargs(processor)
|
||||
|
||||
input_str = ["lower newer", "upper older longer string"]
|
||||
# images needs to be nested to detect multiple prompts
|
||||
image_input = [self.prepare_image_inputs()] * 2
|
||||
inputs = processor(
|
||||
text=input_str,
|
||||
images=image_input,
|
||||
return_tensors="pt",
|
||||
size={"height": 240, "width": 240},
|
||||
padding="longest",
|
||||
max_length=76,
|
||||
)
|
||||
|
||||
self.assertEqual(inputs["pixel_values"][0][0].shape[-1], 240)
|
||||
self.assertEqual(len(inputs["input_ids"][0]), 4)
|
||||
|
||||
Reference in New Issue
Block a user