Uniformize kwargs for LLaVa processor and update docs (#32858)

* Uniformize kwargs for LLaVa and update docs

* Change order of processor inputs in docstring

* Improve backward-compatibility (BC) support for reversed images and text inputs

* Clean up LLaVa processor call docstring

* Add encoded inputs as valid text inputs in reverse input check, add deprecation version in warning

* Move the function that checks for reversed images and text inputs outside the base processor class

* Refactor _validate_images_text_input_order

* Add ProcessingUtilTester

* Fix processing and test_processing
This commit is contained in:
Yoni Gozlan
2024-09-16 11:26:26 -04:00
committed by GitHub
parent ce62a41880
commit 2f62146f0e
4 changed files with 104 additions and 48 deletions

View File

@@ -274,7 +274,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
prompt = "<image>\nUSER: What are the things I should be cautious about when I visit this place?\nASSISTANT:"
image_file = "https://llava-vl.github.io/static/images/view.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
inputs = self.processor(prompt, raw_image, return_tensors="pt")
inputs = self.processor(images=raw_image, text=prompt, return_tensors="pt")
EXPECTED_INPUT_IDS = torch.tensor([[1, 32000, 28705, 13, 11123, 28747, 1824, 460, 272, 1722,315, 1023, 347, 13831, 925, 684, 739, 315, 3251, 456,1633, 28804, 13, 4816, 8048, 12738, 28747]]) # fmt: skip
self.assertTrue(torch.equal(inputs["input_ids"], EXPECTED_INPUT_IDS))
@@ -299,7 +299,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
prompt = "USER: <image>\nWhat are the things I should be cautious about when I visit this place? ASSISTANT:"
image_file = "https://llava-vl.github.io/static/images/view.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
output = model.generate(**inputs, max_new_tokens=900, do_sample=False)
EXPECTED_DECODED_TEXT = "USER: \nWhat are the things I should be cautious about when I visit this place? ASSISTANT: When visiting this place, which is a pier or dock extending over a body of water, there are a few things to be cautious about. First, be aware of the weather conditions, as sudden changes in weather can make the pier unsafe to walk on. Second, be mindful of the water depth and any potential hazards, such as submerged rocks or debris, that could cause accidents or injuries. Additionally, be cautious of the tides and currents, as they can change rapidly and pose a risk to swimmers or those who venture too close to the edge of the pier. Finally, be respectful of the environment and other visitors, and follow any posted rules or guidelines for the area." # fmt: skip
@@ -325,7 +325,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = processor(prompts, images=[image1, image2], return_tensors="pt", padding=True)
inputs = processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)
output = model.generate(**inputs, max_new_tokens=20)
@@ -349,7 +349,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = self.processor(prompts, images=[image1, image2], return_tensors="pt", padding=True)
inputs = self.processor(images=[image1, image2], text=prompts, return_tensors="pt", padding=True)
output = model.generate(**inputs, max_new_tokens=20)
@@ -381,7 +381,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
image1 = Image.open(requests.get("https://llava-vl.github.io/static/images/view.jpg", stream=True).raw)
image2 = Image.open(requests.get("http://images.cocodataset.org/val2017/000000039769.jpg", stream=True).raw)
inputs = processor(prompts, images=[image1, image2, image1], return_tensors="pt", padding=True)
inputs = processor(images=[image1, image2, image1], text=prompts, return_tensors="pt", padding=True)
output = model.generate(**inputs, max_new_tokens=20)
@@ -409,8 +409,8 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
image2 = Image.open(requests.get(url2, stream=True).raw)
inputs = processor(
text=[prompt1, prompt2, prompt3],
images=[image1, image2, image1, image2],
text=[prompt1, prompt2, prompt3],
return_tensors="pt",
padding=True,
).to(torch_device)
@@ -444,7 +444,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
image_file = "http://images.cocodataset.org/val2017/000000039769.jpg"
raw_image = Image.open(requests.get(image_file, stream=True).raw)
inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
# Make sure that `generate` works
_ = model.generate(**inputs, max_new_tokens=20)
@@ -510,7 +510,7 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
processor = AutoProcessor.from_pretrained(model_id)
# Prepare inputs with no images
inputs = processor("Hello, I am", return_tensors="pt").to(torch_device)
inputs = processor(text="Hello, I am", return_tensors="pt").to(torch_device)
# Make sure that `generate` works
_ = model.generate(**inputs, max_new_tokens=20)
@@ -554,13 +554,13 @@ class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
# check processing with expansion of inputs
processor.vision_feature_select_strategy = "default"
processor.patch_size = 14
inputs_expanded = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
inputs_expanded = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs_expanded.input_ids.shape[-1] == 593)
# check processing without expansion of inputs (legacy behavior)
processor.vision_feature_select_strategy = None
processor.patch_size = None
inputs = processor(prompt, raw_image, return_tensors="pt").to(torch_device, torch.float16)
inputs = processor(images=raw_image, text=prompt, return_tensors="pt").to(torch_device, torch.float16)
self.assertTrue(inputs.input_ids.shape[-1] == 18)
# generate exactly 20 tokens

View File

@@ -11,18 +11,43 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import shutil
import tempfile
import unittest
from transformers.testing_utils import require_vision
from transformers import AutoProcessor, AutoTokenizer, LlamaTokenizerFast, LlavaProcessor
from transformers.testing_utils import require_torch, require_vision
from transformers.utils import is_vision_available
from ...test_processing_common import ProcessorTesterMixin
if is_vision_available():
from transformers import AutoTokenizer, LlavaProcessor
from transformers import CLIPImageProcessor
@require_vision
class LlavaProcessorTest(unittest.TestCase):
class LlavaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
processor_class = LlavaProcessor
def setUp(self):
self.tmpdirname = tempfile.mkdtemp()
image_processor = CLIPImageProcessor(do_center_crop=False)
tokenizer = LlamaTokenizerFast.from_pretrained("huggyllama/llama-7b")
processor = LlavaProcessor(image_processor=image_processor, tokenizer=tokenizer)
processor.save_pretrained(self.tmpdirname)
def get_tokenizer(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).tokenizer
def get_image_processor(self, **kwargs):
return AutoProcessor.from_pretrained(self.tmpdirname, **kwargs).image_processor
def tearDown(self):
shutil.rmtree(self.tmpdirname)
def test_can_load_various_tokenizers(self):
for checkpoint in ["Intel/llava-gemma-2b", "llava-hf/llava-1.5-7b-hf"]:
processor = LlavaProcessor.from_pretrained(checkpoint)
@@ -45,3 +70,29 @@ class LlavaProcessorTest(unittest.TestCase):
formatted_prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
self.assertEqual(expected_prompt, formatted_prompt)
@require_torch
@require_vision
def test_unstructured_kwargs_batched(self):
if "image_processor" not in self.processor_class.attributes:
self.skipTest(f"image_processor attribute not present in {self.processor_class}")
image_processor = self.get_component("image_processor")
tokenizer = self.get_component("tokenizer")
processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
self.skip_processor_without_typed_kwargs(processor)
input_str = ["lower newer", "upper older longer string"]
image_input = self.prepare_image_inputs() * 2
inputs = processor(
images=image_input,
text=input_str,
return_tensors="pt",
size={"height": 214, "width": 214},
padding="longest",
max_length=76,
)
self.assertEqual(inputs["pixel_values"].shape[2], 214)
self.assertEqual(len(inputs["input_ids"][0]), 5)