Uniformize kwargs for chameleon processor (#32181)

* uniformize kwargs of Chameleon

* fix linter nit

* rm stride default

* add tests for chameleon processor

* fix tests

* add comment on get_component

* rm Chameleon's slow tokenizer

* add check order images text + nit

* update docs and tests

* Fix LlamaTokenizer tests

* fix gated repo access

* fix wrong import

---------

Co-authored-by: yonigozlan <yoni.gozlan@huggingface.co>
This commit is contained in:
Franz Louis Cesista
2024-09-26 22:18:07 +08:00
committed by GitHub
parent f2c388e3f9
commit 0a21381ba3
6 changed files with 102 additions and 53 deletions

View File

@@ -78,7 +78,7 @@ url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
image = Image.open(requests.get(url, stream=True).raw)
prompt = "What do you see in this image?<image>"
inputs = processor(prompt, image, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
# autoregressively complete prompt
output = model.generate(**inputs, max_new_tokens=50)
@@ -117,7 +117,7 @@ prompts = [
# We can simply feed images in the order they have to be used in the text prompt
# Each "<image>" token uses one image leaving the next for the subsequent "<image>" tokens
inputs = processor(text=prompts, images=[image_stop, image_cats, image_snowman], padding=True, return_tensors="pt").to(device="cuda", dtype=torch.bfloat16)
inputs = processor(images=[image_stop, image_cats, image_snowman], text=prompts, padding=True, return_tensors="pt").to(device="cuda", dtype=torch.bfloat16)
# Generate
generate_ids = model.generate(**inputs, max_new_tokens=50)

View File

@@ -24,7 +24,7 @@ from PIL import Image
from transformers import (
ChameleonConfig,
ChameleonForCausalLM,
ChameleonForConditionalGeneration,
ChameleonImageProcessor,
ChameleonProcessor,
)
@@ -49,10 +49,10 @@ python src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py \
Thereafter, models can be loaded via:
```py
from transformers import ChameleonForCausalLM, LlamaTokenizer
from transformers import ChameleonForConditionalGeneration, LlamaTokenizerFast
model = ChameleonForCausalLM.from_pretrained("/output/path")
tokenizer = LlamaTokenizer.from_pretrained("/output/path")
model = ChameleonForConditionalGeneration.from_pretrained("/output/path")
tokenizer = LlamaTokenizerFast.from_pretrained("/output/path")
```
Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
@@ -372,7 +372,7 @@ def write_model(model_path, input_base_path, model_size, chameleon_version=1):
vocabulary_map=vocabulary_map,
)
with init_empty_weights():
model = ChameleonForCausalLM(config)
model = ChameleonForConditionalGeneration(config)
model.load_state_dict(state_dict, assign=True, strict=False)
model.save_pretrained(model_path, safe_serialization=True)
@@ -397,7 +397,7 @@ def write_model(model_path, input_base_path, model_size, chameleon_version=1):
# taken from https://github.com/facebookresearch/chameleon/blob/7a72f40aa5f462965c8374f25257f55b65b25ff4/data/prompts_for_human_evaluations.jsonl
print("Loading the checkpoint in a Chameleon model...")
print("*" * 100)
model = ChameleonForCausalLM.from_pretrained(
model = ChameleonForConditionalGeneration.from_pretrained(
model_path, attn_implementation="eager", torch_dtype=torch.bfloat16, device_map="auto"
)
processor = ChameleonProcessor.from_pretrained(model_path)

View File

@@ -1568,7 +1568,7 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixi
>>> image = Image.open(requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw)
>>> image_2 = Image.open(requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw)
>>> inputs = processor(prompt, images=[image, image_2], return_tensors="pt").to(model.device, torch.bfloat16)
>>> inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.bfloat16)
>>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
>>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]

View File

@@ -20,9 +20,25 @@ from typing import List, Optional, Union
from ...feature_extraction_utils import BatchFeature
from ...image_utils import ImageInput
from ...processing_utils import ProcessorMixin
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
from ...utils import TensorType
from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack, _validate_images_text_input_order
from ...tokenization_utils_base import PreTokenizedInput, TextInput
class ChameleonTextKwargs(TextKwargs, total=False):
return_for_text_completion: bool
class ChameleonProcessorKwargs(ProcessingKwargs, total=False):
text_kwargs: ChameleonTextKwargs
_defaults = {
"text_kwargs": {
"padding": False,
"return_for_text_completion": False,
},
"common_kwargs": {
"return_tensors": "pt",
},
}
class ChameleonProcessor(ProcessorMixin):
@@ -57,13 +73,11 @@ class ChameleonProcessor(ProcessorMixin):
def __call__(
self,
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
images: ImageInput = None,
padding: Union[bool, str, PaddingStrategy] = False,
truncation: Union[bool, str, TruncationStrategy] = None,
max_length: int = None,
return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
return_for_text_completion: bool = False,
images: Optional[ImageInput] = None,
text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
audio=None,
videos=None,
**kwargs: Unpack[ChameleonProcessorKwargs],
) -> BatchFeature:
"""
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
@@ -73,26 +87,13 @@ class ChameleonProcessor(ProcessorMixin):
of the above two methods for more information.
Args:
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
text (`str`, `List[str]`, `List[List[str]]`):
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
tensor. Both channels-first and channels-last formats are supported.
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
Select a strategy to pad the returned sequences (according to the model's padding side and padding
index) among:
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
sequence if provided).
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
acceptable input length for the model if that argument is not provided.
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
lengths).
max_length (`int`, *optional*):
Maximum length of the returned list and optionally padding length (see above).
truncation (`bool`, *optional*):
Activates truncation to cut input sequences longer than `max_length` to `max_length`.
return_tensors (`str` or [`~utils.TensorType`], *optional*):
If set, will return tensors of a particular framework. Acceptable values are:
@@ -110,10 +111,21 @@ class ChameleonProcessor(ProcessorMixin):
`None`).
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
"""
# check if images and text inputs are reversed for BC
images, text = _validate_images_text_input_order(images, text)
if isinstance(text, str):
text = [text]
elif not isinstance(text, list) and not isinstance(text[0], str):
raise TypeError("Invalid input text. Please provide a string, or a list of strings")
if text is None and images is None:
raise ValueError("You must provide either text or images")
output_kwargs = self._merge_kwargs(
ChameleonProcessorKwargs,
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
**kwargs,
)
return_for_text_completion = output_kwargs["text_kwargs"].pop("return_for_text_completion", False)
# Replace the image token with the expanded image token sequence
prompt_strings = []
@@ -124,19 +136,12 @@ class ChameleonProcessor(ProcessorMixin):
sample += self.tokenizer.sep_token # special Chameleon treatment to add sep for chat mode
prompt_strings.append(sample)
data = self.tokenizer(
prompt_strings,
return_tensors=return_tensors,
padding=padding,
truncation=truncation,
max_length=max_length,
)
data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
if images is not None:
pixel_values = self.image_processor(images, return_tensors=return_tensors)["pixel_values"]
data["pixel_values"] = pixel_values
data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
return BatchFeature(data=data, tensor_type=return_tensors)
return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"])
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
def batch_decode(self, *args, **kwargs):

View File

@@ -350,7 +350,7 @@ class ChameleonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
processor.tokenizer.padding_side = "right"
inputs = processor(texts, return_tensors="pt", padding=True).to(0)
inputs = processor(text=texts, return_tensors="pt", padding=True).to(0)
output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False)
output_native = processor.tokenizer.batch_decode(output_native)
@@ -392,7 +392,7 @@ class ChameleonIntegrationTest(unittest.TestCase):
)
prompt = "<image>Describe what do you see here and tell me about the history behind it?"
inputs = processor(prompt, images=image, return_tensors="pt").to(model.device, torch.float16)
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, torch.float16)
# greedy generation outputs
EXPECTED_TEXT_COMPLETION = ['Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue line extending across the center of the image. The line is labeled "390 light years" and is accompanied by a small black and'] # fmt: skip
@@ -420,7 +420,7 @@ class ChameleonIntegrationTest(unittest.TestCase):
"What constellation is this image showing?<image>",
]
inputs = processor(prompts, images=[image, image_2], padding=True, return_tensors="pt").to(
inputs = processor(images=[image, image_2], text=prompts, padding=True, return_tensors="pt").to(
model.device, torch.float16
)
@@ -450,7 +450,7 @@ class ChameleonIntegrationTest(unittest.TestCase):
)
prompt = "What do these two images have in common?<image><image>"
inputs = processor(prompt, images=[image, image_2], return_tensors="pt").to(model.device, torch.float16)
inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.float16)
# greedy generation outputs
EXPECTED_TEXT_COMPLETION = ['What do these two images have in common?The two images show a connection between two things that are not necessarily related. The first image shows a group of stars, while the second image shows a network of lines connecting two points. The connection between'] # fmt: skip

View File

@@ -0,0 +1,44 @@
# coding=utf-8
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Testing suite for the PyTorch chameleon model."""
import tempfile
import unittest
from transformers import ChameleonProcessor, LlamaTokenizer
from transformers.testing_utils import get_tests_dir
from transformers.utils import is_vision_available
from ...test_processing_common import ProcessorTesterMixin
if is_vision_available():
from transformers import ChameleonImageProcessor
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = ChameleonProcessor
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
image_processor = ChameleonImageProcessor()
tokenizer = LlamaTokenizer(vocab_file=SAMPLE_VOCAB)
tokenizer.pad_token_id = 0
tokenizer.sep_token_id = 1
processor = self.processor_class(image_processor=image_processor, tokenizer=tokenizer)
processor.save_pretrained(self.tmpdirname)