fix pixtral processor (#34486)
* fix pixtral processor
* test out full length batches + remove undue ValueError
* fix up processing
* fix tests
* fix
* last fixup
* style
* [run-slow] pixtral
* [run-slow] pixtral
* fix config key
* skip torchscript tests
* [run-slow] pixtral
* add missing key
* [run-slow] pixtral
* fix docs
* [run-slow] pixtral
* fix wrong url for integration test
* [run-slow] pixtral
* pixtralVisionModel does not have a lm head
* [run-slow] pixtral
This commit is contained in:
@@ -52,6 +52,8 @@ class PixtralVisionConfig(PretrainedConfig):
|
|||||||
Dropout probability for the attention layers.
|
Dropout probability for the attention layers.
|
||||||
rope_theta (`float`, *optional*, defaults to 10000.0):
|
rope_theta (`float`, *optional*, defaults to 10000.0):
|
||||||
The base period of the RoPE embeddings.
|
The base period of the RoPE embeddings.
|
||||||
|
initializer_range (`float`, *optional*, defaults to 0.02):
|
||||||
|
The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
|
||||||
|
|
||||||
Example:
|
Example:
|
||||||
|
|
||||||
@@ -82,6 +84,7 @@ class PixtralVisionConfig(PretrainedConfig):
|
|||||||
hidden_act="gelu",
|
hidden_act="gelu",
|
||||||
attention_dropout=0.0,
|
attention_dropout=0.0,
|
||||||
rope_theta=10000.0,
|
rope_theta=10000.0,
|
||||||
|
initializer_range=0.02,
|
||||||
**kwargs,
|
**kwargs,
|
||||||
):
|
):
|
||||||
super().__init__(**kwargs)
|
super().__init__(**kwargs)
|
||||||
@@ -97,3 +100,4 @@ class PixtralVisionConfig(PretrainedConfig):
|
|||||||
self.hidden_act = hidden_act
|
self.hidden_act = hidden_act
|
||||||
self.rope_theta = rope_theta
|
self.rope_theta = rope_theta
|
||||||
self.head_dim = hidden_size // num_attention_heads
|
self.head_dim = hidden_size // num_attention_heads
|
||||||
|
self.initializer_range = initializer_range
|
||||||
|
|||||||
@@ -407,7 +407,7 @@ class PixtralPreTrainedModel(PreTrainedModel):
|
|||||||
std = (
|
std = (
|
||||||
self.config.initializer_range
|
self.config.initializer_range
|
||||||
if hasattr(self.config, "initializer_range")
|
if hasattr(self.config, "initializer_range")
|
||||||
else self.config.text_config.initializer_range
|
else self.config.initializer_range
|
||||||
)
|
)
|
||||||
|
|
||||||
if isinstance(module, (nn.Linear, nn.Conv2d)):
|
if isinstance(module, (nn.Linear, nn.Conv2d)):
|
||||||
|
|||||||
@@ -206,14 +206,15 @@ class PixtralProcessor(ProcessorMixin):
|
|||||||
if is_image_or_image_url(images):
|
if is_image_or_image_url(images):
|
||||||
images = [[images]]
|
images = [[images]]
|
||||||
elif isinstance(images, list) and is_image_or_image_url(images[0]):
|
elif isinstance(images, list) and is_image_or_image_url(images[0]):
|
||||||
|
if isinstance(text, list):
|
||||||
|
images = [[im] for im in images]
|
||||||
|
else:
|
||||||
images = [images]
|
images = [images]
|
||||||
elif (
|
elif isinstance(images, list) and isinstance(images[0], list) and is_image_or_image_url(images[0][0]):
|
||||||
not isinstance(images, list)
|
pass
|
||||||
and not isinstance(images[0], list)
|
else:
|
||||||
and not is_image_or_image_url(images[0][0])
|
|
||||||
):
|
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
"Invalid input images. Please provide a single image or a list of images or a list of list of images."
|
"Invalid input images. Please provide a single image, a list of images, or a list of lists of images."
|
||||||
)
|
)
|
||||||
images = [[load_image(im) for im in sample] for sample in images]
|
images = [[load_image(im) for im in sample] for sample in images]
|
||||||
image_inputs = self.image_processor(images, patch_size=self.patch_size, **output_kwargs["images_kwargs"])
|
image_inputs = self.image_processor(images, patch_size=self.patch_size, **output_kwargs["images_kwargs"])
|
||||||
|
|||||||
@@ -14,22 +14,16 @@
|
|||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
"""Testing suite for the PyTorch Pixtral model."""
|
"""Testing suite for the PyTorch Pixtral model."""
|
||||||
|
|
||||||
import gc
|
|
||||||
import unittest
|
import unittest
|
||||||
|
|
||||||
import requests
|
|
||||||
|
|
||||||
from transformers import (
|
from transformers import (
|
||||||
AutoProcessor,
|
|
||||||
PixtralVisionConfig,
|
PixtralVisionConfig,
|
||||||
PixtralVisionModel,
|
PixtralVisionModel,
|
||||||
is_torch_available,
|
is_torch_available,
|
||||||
is_vision_available,
|
is_vision_available,
|
||||||
)
|
)
|
||||||
from transformers.testing_utils import (
|
from transformers.testing_utils import (
|
||||||
require_bitsandbytes,
|
|
||||||
require_torch,
|
require_torch,
|
||||||
slow,
|
|
||||||
torch_device,
|
torch_device,
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -43,7 +37,7 @@ else:
|
|||||||
is_torch_greater_or_equal_than_2_0 = False
|
is_torch_greater_or_equal_than_2_0 = False
|
||||||
|
|
||||||
if is_vision_available():
|
if is_vision_available():
|
||||||
from PIL import Image
|
pass
|
||||||
|
|
||||||
|
|
||||||
class PixtralVisionModelTester:
|
class PixtralVisionModelTester:
|
||||||
@@ -148,6 +142,7 @@ class PixtralVisionModelModelTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
all_model_classes = (PixtralVisionModel,) if is_torch_available() else ()
|
all_model_classes = (PixtralVisionModel,) if is_torch_available() else ()
|
||||||
test_pruning = False
|
test_pruning = False
|
||||||
test_head_masking = False
|
test_head_masking = False
|
||||||
|
test_torchscript = False
|
||||||
|
|
||||||
def setUp(self):
|
def setUp(self):
|
||||||
self.model_tester = PixtralVisionModelTester(self)
|
self.model_tester = PixtralVisionModelTester(self)
|
||||||
@@ -258,35 +253,3 @@ class PixtralVisionModelModelTest(ModelTesterMixin, unittest.TestCase):
|
|||||||
@unittest.skip(reason="Not supported yet")
|
@unittest.skip(reason="Not supported yet")
|
||||||
def test_determinism(self):
|
def test_determinism(self):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
|
|
||||||
@require_torch
|
|
||||||
class PixtralVisionModelIntegrationTest(unittest.TestCase):
|
|
||||||
def setUp(self):
|
|
||||||
self.processor = AutoProcessor.from_pretrained("hf-internal-testing/pixtral-12b")
|
|
||||||
|
|
||||||
def tearDown(self):
|
|
||||||
gc.collect()
|
|
||||||
torch.cuda.empty_cache()
|
|
||||||
|
|
||||||
@slow
|
|
||||||
@require_bitsandbytes
|
|
||||||
def test_small_model_integration_test(self):
|
|
||||||
# Let' s make sure we test the preprocessing to replace what is used
|
|
||||||
model = PixtralVisionModel.from_pretrained("hf-internal-testing/pixtral-12b", load_in_4bit=True)
|
|
||||||
|
|
||||||
prompt = "<s>[INST][IMG]\nWhat are the things I should be cautious about when I visit this place?[/INST]"
|
|
||||||
image_file = "https://pixtral-vl.github.io/static/images/view.jpg"
|
|
||||||
raw_image = Image.open(requests.get(image_file, stream=True).raw)
|
|
||||||
inputs = self.processor(prompt, raw_image, return_tensors="pt")
|
|
||||||
|
|
||||||
EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]]) # fmt: skip
|
|
||||||
self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
|
|
||||||
|
|
||||||
output = model.generate(**inputs, max_new_tokens=20)
|
|
||||||
EXPECTED_DECODED_TEXT = "\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT: When visiting this place, there are a few things one should be cautious about. Firstly," # fmt: skip
|
|
||||||
|
|
||||||
self.assertEqual(
|
|
||||||
self.processor.decode(output[0], skip_special_tokens=True),
|
|
||||||
EXPECTED_DECODED_TEXT,
|
|
||||||
)
|
|
||||||
|
|||||||
@@ -246,6 +246,25 @@ class PixtralProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
|||||||
)
|
)
|
||||||
# fmt: on
|
# fmt: on
|
||||||
|
|
||||||
|
def test_processor_returns_full_length_batches(self):
|
||||||
|
# to avoid https://github.com/huggingface/transformers/issues/34204
|
||||||
|
processor = self.processor_class.from_pretrained(self.tmpdirname)
|
||||||
|
prompt_string = [
|
||||||
|
"USER: [IMG]\nWhat's the content of the image? ASSISTANT:",
|
||||||
|
] * 5
|
||||||
|
processor.tokenizer.pad_token = "</s>"
|
||||||
|
image_inputs = [self.image_0] * 5
|
||||||
|
|
||||||
|
# Make small for checking image token expansion
|
||||||
|
processor.image_processor.size = {"longest_edge": 30}
|
||||||
|
processor.image_processor.patch_size = {"height": 2, "width": 2}
|
||||||
|
|
||||||
|
# Test passing in an image
|
||||||
|
inputs_image = processor(text=prompt_string, images=image_inputs, return_tensors="pt", padding=True)
|
||||||
|
self.assertIn("input_ids", inputs_image)
|
||||||
|
self.assertTrue(len(inputs_image["input_ids"]) == 5)
|
||||||
|
self.assertTrue(len(inputs_image["pixel_values"]) == 5)
|
||||||
|
|
||||||
# Override as PixtralProcessor needs nested images to work properly with batched inputs
|
# Override as PixtralProcessor needs nested images to work properly with batched inputs
|
||||||
@require_vision
|
@require_vision
|
||||||
def prepare_image_inputs(self, batch_size: Optional[int] = None):
|
def prepare_image_inputs(self, batch_size: Optional[int] = None):
|
||||||
|
|||||||
Reference in New Issue
Block a user