Expand inputs in processors for VLMs (#30962)
* let it be * draft * should not have changed * add warnings * fix & add tests * fix tests * inputs embeds cannot be passed with pixels * more updates * paligemma ready! * minor typos * update blip-2 * fix tests & raise error * docstring * add blip2 test * tmp * add image seq length to config * update docstring * delete * fix tests * fix blip * fix paligemma * out-of-place scatter * add llava-next-video * Update src/transformers/models/blip_2/modeling_blip_2.py Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com> * remove tmp * codestyle * nits * more nits * remove overriding in tests * comprehension when merging video * fix-copies * revert changes for embeds test * fix tests after making comprehension * Update src/transformers/models/blip_2/processing_blip_2.py Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com> * Update src/transformers/models/blip_2/processing_blip_2.py Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com> * more updates * fix tests --------- Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
2a5a6ad18a
commit
a29eabd0eb
@@ -583,3 +583,33 @@ class InstructBlipVideoModelIntegrationTest(unittest.TestCase):
|
||||
generated_text,
|
||||
"a baby girl wearing glasses is reading a book on the bed 1080p",
|
||||
)
|
||||
|
||||
def test_expansion_in_processing(self):
    """Check that generation is identical on the legacy and the expanded-input paths.

    The same video clip and prompt are run through the model twice:

    1. With ``processor.num_query_tokens`` and ``model.config.video_token_index``
       set to ``None`` (the legacy path).
    2. With both set, a ``<video>`` special token added to the tokenizer and the
       embedding matrix resized (inputs expanded in the processing file).

    The decoded text of both runs must match.
    """
    processor = InstructBlipVideoProcessor.from_pretrained("Salesforce/instructblip-vicuna-7b")
    model = InstructBlipVideoForConditionalGeneration.from_pretrained(
        "Salesforce/instructblip-vicuna-7b", load_in_8bit=True, low_cpu_mem_usage=True
    )

    clip = prepare_video()
    prompt = "Explain what is happening in this short video."

    # Make sure we will go the legacy path by setting these args to None
    processor.num_query_tokens = None
    model.config.video_token_index = None
    inputs = processor(images=clip, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)

    predictions = model.generate(**inputs, do_sample=False, max_new_tokens=15)
    generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()

    # Add args to the config to trigger new logic when inputs are expanded in processing file
    processor.num_query_tokens = model.config.num_query_tokens
    processor.tokenizer.add_special_tokens({"additional_special_tokens": ["<video>"]})
    # NOTE(review): assumes "<video>" was newly appended and is therefore the last
    # id in the vocabulary -- confirm the checkpoint's tokenizer does not already contain it.
    model.config.video_token_index = len(processor.tokenizer) - 1
    model.resize_token_embeddings(len(processor.tokenizer), pad_to_multiple_of=64)

    # Generate again with new inputs
    inputs = processor(images=clip, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
    predictions_expanded = model.generate(**inputs, do_sample=False, max_new_tokens=15)
    generated_text_expanded = processor.batch_decode(predictions_expanded, skip_special_tokens=True)[0].strip()

    # assertEqual (rather than assertTrue(a == b)) reports both strings on failure
    self.assertEqual(generated_text_expanded, generated_text)
|
||||
|
||||
Reference in New Issue
Block a user