Expand inputs in processors for VLMs (#30962)

* let it be

* draft

* should not have changed

* add warnings

* fix & add tests

* fix tests

* ipnuts embeds cannot be passed with pixels

* more updates

* paligemma ready!

* minor typos

* update blip-2

* fix tests & raise error

* docstring

* add blip2 test

* tmp

* add image seq length to config

* update docstring

* delete

* fix tests

* fix blip

* fix paligemma

* out-of-place scatter

* add llava-next-video

* Update src/transformers/models/blip_2/modeling_blip_2.py

Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>

* remove tmp

* codestyle

* nits

* more nits

* remove overriding in tests

* comprehension when merging video

* fix-copies

* revert changes for embeds test

* fix tests after making comprehension

* Update src/transformers/models/blip_2/processing_blip_2.py

Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>

* Update src/transformers/models/blip_2/processing_blip_2.py

Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>

* more updates

* fix tests

---------

Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>
This commit is contained in:
Raushan Turganbay
2024-08-13 10:14:39 +05:00
committed by GitHub
parent 2a5a6ad18a
commit a29eabd0eb
37 changed files with 1951 additions and 802 deletions

View File

@@ -1033,3 +1033,33 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
[0, 3, 7, 152, 67, 839, 1],
)
self.assertEqual(generated_text, "san diego")
def test_expansion_in_processing(self):
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
model = Blip2ForConditionalGeneration.from_pretrained(
"Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
).to(torch_device)
image = prepare_img()
prompt = "Question: which city is this? Answer:"
# Make sure we will go the legacy path by setting these args to None
processor.num_query_tokens = None
model.config.image_token_index = None
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
predictions = model.generate(**inputs, do_sample=False, max_new_tokens=15)
generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()
# Add args to the config to trigger new logic when inputs are expanded in processing file
processor.num_query_tokens = model.config.num_query_tokens
processor.tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
model.config.image_token_index = len(processor.tokenizer) - 1
model.resize_token_embeddings(processor.tokenizer.vocab_size, pad_to_multiple_of=64)
# Generate again with new inputs
inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
predictions_expanded = model.generate(**inputs, do_sample=False, max_new_tokens=15)
generated_text_expanded = processor.batch_decode(predictions_expanded, skip_special_tokens=True)[0].strip()
self.assertTrue(generated_text_expanded == generated_text)