Uniformize kwargs for chameleon processor (#32181)
* uniformize kwargs of Chameleon * fix linter nit * rm stride default * add tests for chameleon processor * fix tests * add comment on get_component * rm Chameleon's slow tokenizer * add check order images text + nit * update docs and tests * Fix LlamaTokenizer tests * fix gated repo access * fix wrong import --------- Co-authored-by: yonigozlan <yoni.gozlan@huggingface.co>
This commit is contained in:
committed by
GitHub
parent
f2c388e3f9
commit
0a21381ba3
@@ -78,7 +78,7 @@ url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
|
|||||||
image = Image.open(requests.get(url, stream=True).raw)
|
image = Image.open(requests.get(url, stream=True).raw)
|
||||||
prompt = "What do you see in this image?<image>"
|
prompt = "What do you see in this image?<image>"
|
||||||
|
|
||||||
inputs = processor(prompt, image, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
|
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, dtype=torch.bfloat16)
|
||||||
|
|
||||||
# autoregressively complete prompt
|
# autoregressively complete prompt
|
||||||
output = model.generate(**inputs, max_new_tokens=50)
|
output = model.generate(**inputs, max_new_tokens=50)
|
||||||
@@ -117,7 +117,7 @@ prompts = [
|
|||||||
|
|
||||||
# We can simply feed images in the order they have to be used in the text prompt
|
# We can simply feed images in the order they have to be used in the text prompt
|
||||||
# Each "<image>" token uses one image leaving the next for the subsequent "<image>" tokens
|
# Each "<image>" token uses one image leaving the next for the subsequent "<image>" tokens
|
||||||
inputs = processor(text=prompts, images=[image_stop, image_cats, image_snowman], padding=True, return_tensors="pt").to(device="cuda", dtype=torch.bfloat16)
|
inputs = processor(images=[image_stop, image_cats, image_snowman], text=prompts, padding=True, return_tensors="pt").to(device="cuda", dtype=torch.bfloat16)
|
||||||
|
|
||||||
# Generate
|
# Generate
|
||||||
generate_ids = model.generate(**inputs, max_new_tokens=50)
|
generate_ids = model.generate(**inputs, max_new_tokens=50)
|
||||||
|
|||||||
@@ -24,7 +24,7 @@ from PIL import Image
|
|||||||
|
|
||||||
from transformers import (
|
from transformers import (
|
||||||
ChameleonConfig,
|
ChameleonConfig,
|
||||||
ChameleonForCausalLM,
|
ChameleonForConditionalGeneration,
|
||||||
ChameleonImageProcessor,
|
ChameleonImageProcessor,
|
||||||
ChameleonProcessor,
|
ChameleonProcessor,
|
||||||
)
|
)
|
||||||
@@ -49,10 +49,10 @@ python src/transformers/models/chameleon/convert_chameleon_weights_to_hf.py \
|
|||||||
Thereafter, models can be loaded via:
|
Thereafter, models can be loaded via:
|
||||||
|
|
||||||
```py
|
```py
|
||||||
from transformers import ChameleonForCausalLM, LlamaTokenizer
|
from transformers import ChameleonForConditionalGeneration, LlamaTokenizerFast
|
||||||
|
|
||||||
model = ChameleonForCausalLM.from_pretrained("/output/path")
|
model = ChameleonForConditionalGeneration.from_pretrained("/output/path")
|
||||||
tokenizer = LlamaTokenizer.from_pretrained("/output/path")
|
tokenizer = LlamaTokenizerFast.from_pretrained("/output/path")
|
||||||
```
|
```
|
||||||
|
|
||||||
Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
|
Important note: you need to be able to host the whole model in RAM to execute this script (even if the biggest versions
|
||||||
@@ -372,7 +372,7 @@ def write_model(model_path, input_base_path, model_size, chameleon_version=1):
|
|||||||
vocabulary_map=vocabulary_map,
|
vocabulary_map=vocabulary_map,
|
||||||
)
|
)
|
||||||
with init_empty_weights():
|
with init_empty_weights():
|
||||||
model = ChameleonForCausalLM(config)
|
model = ChameleonForConditionalGeneration(config)
|
||||||
|
|
||||||
model.load_state_dict(state_dict, assign=True, strict=False)
|
model.load_state_dict(state_dict, assign=True, strict=False)
|
||||||
model.save_pretrained(model_path, safe_serialization=True)
|
model.save_pretrained(model_path, safe_serialization=True)
|
||||||
@@ -397,7 +397,7 @@ def write_model(model_path, input_base_path, model_size, chameleon_version=1):
|
|||||||
# taken from https://github.com/facebookresearch/chameleon/blob/7a72f40aa5f462965c8374f25257f55b65b25ff4/data/prompts_for_human_evaluations.jsonl
|
# taken from https://github.com/facebookresearch/chameleon/blob/7a72f40aa5f462965c8374f25257f55b65b25ff4/data/prompts_for_human_evaluations.jsonl
|
||||||
print("Loading the checkpoint in a Chameleon model...")
|
print("Loading the checkpoint in a Chameleon model...")
|
||||||
print("*" * 100)
|
print("*" * 100)
|
||||||
model = ChameleonForCausalLM.from_pretrained(
|
model = ChameleonForConditionalGeneration.from_pretrained(
|
||||||
model_path, attn_implementation="eager", torch_dtype=torch.bfloat16, device_map="auto"
|
model_path, attn_implementation="eager", torch_dtype=torch.bfloat16, device_map="auto"
|
||||||
)
|
)
|
||||||
processor = ChameleonProcessor.from_pretrained(model_path)
|
processor = ChameleonProcessor.from_pretrained(model_path)
|
||||||
|
|||||||
@@ -1568,7 +1568,7 @@ class ChameleonForConditionalGeneration(ChameleonPreTrainedModel, GenerationMixi
|
|||||||
>>> image = Image.open(requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw)
|
>>> image = Image.open(requests.get("https://nineplanets.org/wp-content/uploads/2020/12/the-big-dipper-1.jpg", stream=True).raw)
|
||||||
>>> image_2 = Image.open(requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw)
|
>>> image_2 = Image.open(requests.get("https://www.kxan.com/wp-content/uploads/sites/40/2020/10/ORION.jpg", stream=True).raw)
|
||||||
|
|
||||||
>>> inputs = processor(prompt, images=[image, image_2], return_tensors="pt").to(model.device, torch.bfloat16)
|
>>> inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.bfloat16)
|
||||||
|
|
||||||
>>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
|
>>> generated_ids = model.generate(**inputs, max_new_tokens=100, do_sample=False)
|
||||||
>>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
>>> processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
|
||||||
|
|||||||
@@ -20,9 +20,25 @@ from typing import List, Optional, Union
|
|||||||
|
|
||||||
from ...feature_extraction_utils import BatchFeature
|
from ...feature_extraction_utils import BatchFeature
|
||||||
from ...image_utils import ImageInput
|
from ...image_utils import ImageInput
|
||||||
from ...processing_utils import ProcessorMixin
|
from ...processing_utils import ProcessingKwargs, ProcessorMixin, TextKwargs, Unpack, _validate_images_text_input_order
|
||||||
from ...tokenization_utils_base import PaddingStrategy, PreTokenizedInput, TextInput, TruncationStrategy
|
from ...tokenization_utils_base import PreTokenizedInput, TextInput
|
||||||
from ...utils import TensorType
|
|
||||||
|
|
||||||
|
class ChameleonTextKwargs(TextKwargs, total=False):
|
||||||
|
return_for_text_completion: bool
|
||||||
|
|
||||||
|
|
||||||
|
class ChameleonProcessorKwargs(ProcessingKwargs, total=False):
|
||||||
|
text_kwargs: ChameleonTextKwargs
|
||||||
|
_defaults = {
|
||||||
|
"text_kwargs": {
|
||||||
|
"padding": False,
|
||||||
|
"return_for_text_completion": False,
|
||||||
|
},
|
||||||
|
"common_kwargs": {
|
||||||
|
"return_tensors": "pt",
|
||||||
|
},
|
||||||
|
}
|
||||||
|
|
||||||
|
|
||||||
class ChameleonProcessor(ProcessorMixin):
|
class ChameleonProcessor(ProcessorMixin):
|
||||||
@@ -57,13 +73,11 @@ class ChameleonProcessor(ProcessorMixin):
|
|||||||
|
|
||||||
def __call__(
|
def __call__(
|
||||||
self,
|
self,
|
||||||
text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
|
images: Optional[ImageInput] = None,
|
||||||
images: ImageInput = None,
|
text: Optional[Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]]] = None,
|
||||||
padding: Union[bool, str, PaddingStrategy] = False,
|
audio=None,
|
||||||
truncation: Union[bool, str, TruncationStrategy] = None,
|
videos=None,
|
||||||
max_length: int = None,
|
**kwargs: Unpack[ChameleonProcessorKwargs],
|
||||||
return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
|
|
||||||
return_for_text_completion: bool = False,
|
|
||||||
) -> BatchFeature:
|
) -> BatchFeature:
|
||||||
"""
|
"""
|
||||||
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
|
||||||
@@ -73,26 +87,13 @@ class ChameleonProcessor(ProcessorMixin):
|
|||||||
of the above two methods for more information.
|
of the above two methods for more information.
|
||||||
|
|
||||||
Args:
|
Args:
|
||||||
|
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
|
||||||
|
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
|
||||||
|
tensor. Both channels-first and channels-last formats are supported.
|
||||||
text (`str`, `List[str]`, `List[List[str]]`):
|
text (`str`, `List[str]`, `List[List[str]]`):
|
||||||
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
||||||
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
||||||
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
||||||
images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
|
|
||||||
The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
|
|
||||||
tensor. Both channels-first and channels-last formats are supported.
|
|
||||||
padding (`bool`, `str` or [`~utils.PaddingStrategy`], *optional*, defaults to `False`):
|
|
||||||
Select a strategy to pad the returned sequences (according to the model's padding side and padding
|
|
||||||
index) among:
|
|
||||||
- `True` or `'longest'`: Pad to the longest sequence in the batch (or no padding if only a single
|
|
||||||
sequence if provided).
|
|
||||||
- `'max_length'`: Pad to a maximum length specified with the argument `max_length` or to the maximum
|
|
||||||
acceptable input length for the model if that argument is not provided.
|
|
||||||
- `False` or `'do_not_pad'` (default): No padding (i.e., can output a batch with sequences of different
|
|
||||||
lengths).
|
|
||||||
max_length (`int`, *optional*):
|
|
||||||
Maximum length of the returned list and optionally padding length (see above).
|
|
||||||
truncation (`bool`, *optional*):
|
|
||||||
Activates truncation to cut input sequences longer than `max_length` to `max_length`.
|
|
||||||
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
return_tensors (`str` or [`~utils.TensorType`], *optional*):
|
||||||
If set, will return tensors of a particular framework. Acceptable values are:
|
If set, will return tensors of a particular framework. Acceptable values are:
|
||||||
|
|
||||||
@@ -110,10 +111,21 @@ class ChameleonProcessor(ProcessorMixin):
|
|||||||
`None`).
|
`None`).
|
||||||
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
|
- **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
|
||||||
"""
|
"""
|
||||||
|
# check if images and text inputs are reversed for BC
|
||||||
|
images, text = _validate_images_text_input_order(images, text)
|
||||||
if isinstance(text, str):
|
if isinstance(text, str):
|
||||||
text = [text]
|
text = [text]
|
||||||
elif not isinstance(text, list) and not isinstance(text[0], str):
|
elif not isinstance(text, list) and not isinstance(text[0], str):
|
||||||
raise TypeError("Invalid input text. Please provide a string, or a list of strings")
|
raise TypeError("Invalid input text. Please provide a string, or a list of strings")
|
||||||
|
if text is None and images is None:
|
||||||
|
raise ValueError("You must provide either text or images")
|
||||||
|
|
||||||
|
output_kwargs = self._merge_kwargs(
|
||||||
|
ChameleonProcessorKwargs,
|
||||||
|
tokenizer_init_kwargs=self.tokenizer.init_kwargs,
|
||||||
|
**kwargs,
|
||||||
|
)
|
||||||
|
return_for_text_completion = output_kwargs["text_kwargs"].pop("return_for_text_completion", False)
|
||||||
|
|
||||||
# Replace the image token with the expanded image token sequence
|
# Replace the image token with the expanded image token sequence
|
||||||
prompt_strings = []
|
prompt_strings = []
|
||||||
@@ -124,19 +136,12 @@ class ChameleonProcessor(ProcessorMixin):
|
|||||||
sample += self.tokenizer.sep_token # special Chameleon treatment to add sep for chat mode
|
sample += self.tokenizer.sep_token # special Chameleon treatment to add sep for chat mode
|
||||||
prompt_strings.append(sample)
|
prompt_strings.append(sample)
|
||||||
|
|
||||||
data = self.tokenizer(
|
data = self.tokenizer(prompt_strings, **output_kwargs["text_kwargs"])
|
||||||
prompt_strings,
|
|
||||||
return_tensors=return_tensors,
|
|
||||||
padding=padding,
|
|
||||||
truncation=truncation,
|
|
||||||
max_length=max_length,
|
|
||||||
)
|
|
||||||
|
|
||||||
if images is not None:
|
if images is not None:
|
||||||
pixel_values = self.image_processor(images, return_tensors=return_tensors)["pixel_values"]
|
data["pixel_values"] = self.image_processor(images, **output_kwargs["images_kwargs"])["pixel_values"]
|
||||||
data["pixel_values"] = pixel_values
|
|
||||||
|
|
||||||
return BatchFeature(data=data, tensor_type=return_tensors)
|
return BatchFeature(data=data, tensor_type=output_kwargs["common_kwargs"]["return_tensors"])
|
||||||
|
|
||||||
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
|
# Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Llama
|
||||||
def batch_decode(self, *args, **kwargs):
|
def batch_decode(self, *args, **kwargs):
|
||||||
|
|||||||
@@ -350,7 +350,7 @@ class ChameleonModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
|
|||||||
|
|
||||||
processor.tokenizer.padding_side = "right"
|
processor.tokenizer.padding_side = "right"
|
||||||
|
|
||||||
inputs = processor(texts, return_tensors="pt", padding=True).to(0)
|
inputs = processor(text=texts, return_tensors="pt", padding=True).to(0)
|
||||||
|
|
||||||
output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False)
|
||||||
output_native = processor.tokenizer.batch_decode(output_native)
|
output_native = processor.tokenizer.batch_decode(output_native)
|
||||||
@@ -392,7 +392,7 @@ class ChameleonIntegrationTest(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
prompt = "<image>Describe what do you see here and tell me about the history behind it?"
|
prompt = "<image>Describe what do you see here and tell me about the history behind it?"
|
||||||
|
|
||||||
inputs = processor(prompt, images=image, return_tensors="pt").to(model.device, torch.float16)
|
inputs = processor(images=image, text=prompt, return_tensors="pt").to(model.device, torch.float16)
|
||||||
|
|
||||||
# greedy generation outputs
|
# greedy generation outputs
|
||||||
EXPECTED_TEXT_COMPLETION = ['Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue line extending across the center of the image. The line is labeled "390 light years" and is accompanied by a small black and'] # fmt: skip
|
EXPECTED_TEXT_COMPLETION = ['Describe what do you see here and tell me about the history behind it?The image depicts a star map, with a bright blue line extending across the center of the image. The line is labeled "390 light years" and is accompanied by a small black and'] # fmt: skip
|
||||||
@@ -420,7 +420,7 @@ class ChameleonIntegrationTest(unittest.TestCase):
|
|||||||
"What constellation is this image showing?<image>",
|
"What constellation is this image showing?<image>",
|
||||||
]
|
]
|
||||||
|
|
||||||
inputs = processor(prompts, images=[image, image_2], padding=True, return_tensors="pt").to(
|
inputs = processor(images=[image, image_2], text=prompts, padding=True, return_tensors="pt").to(
|
||||||
model.device, torch.float16
|
model.device, torch.float16
|
||||||
)
|
)
|
||||||
|
|
||||||
@@ -450,7 +450,7 @@ class ChameleonIntegrationTest(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
prompt = "What do these two images have in common?<image><image>"
|
prompt = "What do these two images have in common?<image><image>"
|
||||||
|
|
||||||
inputs = processor(prompt, images=[image, image_2], return_tensors="pt").to(model.device, torch.float16)
|
inputs = processor(images=[image, image_2], text=prompt, return_tensors="pt").to(model.device, torch.float16)
|
||||||
|
|
||||||
# greedy generation outputs
|
# greedy generation outputs
|
||||||
EXPECTED_TEXT_COMPLETION = ['What do these two images have in common?The two images show a connection between two things that are not necessarily related. The first image shows a group of stars, while the second image shows a network of lines connecting two points. The connection between'] # fmt: skip
|
EXPECTED_TEXT_COMPLETION = ['What do these two images have in common?The two images show a connection between two things that are not necessarily related. The first image shows a group of stars, while the second image shows a network of lines connecting two points. The connection between'] # fmt: skip
|
||||||
|
|||||||
44
tests/models/chameleon/test_processor_chameleon.py
Normal file
44
tests/models/chameleon/test_processor_chameleon.py
Normal file
@@ -0,0 +1,44 @@
|
|||||||
|
# coding=utf-8
|
||||||
|
# Copyright 2024 The HuggingFace Inc. team. All rights reserved.
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
"""Testing suite for the PyTorch chameleon model."""
|
||||||
|
|
||||||
|
import tempfile
|
||||||
|
import unittest
|
||||||
|
|
||||||
|
from transformers import ChameleonProcessor, LlamaTokenizer
|
||||||
|
from transformers.testing_utils import get_tests_dir
|
||||||
|
from transformers.utils import is_vision_available
|
||||||
|
|
||||||
|
from ...test_processing_common import ProcessorTesterMixin
|
||||||
|
|
||||||
|
|
||||||
|
if is_vision_available():
|
||||||
|
from transformers import ChameleonImageProcessor
|
||||||
|
|
||||||
|
|
||||||
|
SAMPLE_VOCAB = get_tests_dir("fixtures/test_sentencepiece.model")
|
||||||
|
|
||||||
|
|
||||||
|
class ChameleonProcessorTest(ProcessorTesterMixin, unittest.TestCase):
|
||||||
|
processor_class = ChameleonProcessor
|
||||||
|
|
||||||
|
def setUp(self):
|
||||||
|
self.tmpdirname = tempfile.mkdtemp()
|
||||||
|
image_processor = ChameleonImageProcessor()
|
||||||
|
tokenizer = LlamaTokenizer(vocab_file=SAMPLE_VOCAB)
|
||||||
|
tokenizer.pad_token_id = 0
|
||||||
|
tokenizer.sep_token_id = 1
|
||||||
|
processor = self.processor_class(image_processor=image_processor, tokenizer=tokenizer)
|
||||||
|
processor.save_pretrained(self.tmpdirname)
|
||||||
Reference in New Issue
Block a user