Expand inputs in processors for VLMs (#30962)

* let it be * draft * should not have changed * add warnings * fix & add tests * fix tests * ipnuts embeds cannot be passed with pixels * more updates * paligemma ready! * minor typos * update blip-2 * fix tests & raise error * docstring * add blip2 test * tmp * add image seq length to config * update docstring * delete * fix tests * fix blip * fix paligemma * out-of-place scatter * add llava-next-video * Update src/transformers/models/blip_2/modeling_blip_2.py Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com> * remove tmp * codestyle * nits * more nits * remove overriding in tests * comprehension when merging video * fix-copies * revert changes for embeds test * fix tests after making comprehension * Update src/transformers/models/blip_2/processing_blip_2.py Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com> * Update src/transformers/models/blip_2/processing_blip_2.py Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com> * more updates * fix tests --------- Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>
2024-08-13 10:14:39 +05:00
parent 2a5a6ad18a
commit a29eabd0eb
37 changed files with 1951 additions and 802 deletions
--- a/tests/models/blip_2/test_modeling_blip_2.py
+++ b/tests/models/blip_2/test_modeling_blip_2.py
@@ -1033,3 +1033,33 @@ class Blip2ModelIntegrationTest(unittest.TestCase):
            [0, 3, 7, 152, 67, 839, 1],
        )
        self.assertEqual(generated_text, "san diego")
+
+    def test_expansion_in_processing(self):
+        processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
+        model = Blip2ForConditionalGeneration.from_pretrained(
+            "Salesforce/blip2-opt-2.7b", torch_dtype=torch.float16
+        ).to(torch_device)
+
+        image = prepare_img()
+        prompt = "Question: which city is this? Answer:"
+
+        # Make sure we will go the legacy path by setting these args to None
+        processor.num_query_tokens = None
+        model.config.image_token_index = None
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
+
+        predictions = model.generate(**inputs, do_sample=False, max_new_tokens=15)
+        generated_text = processor.batch_decode(predictions, skip_special_tokens=True)[0].strip()
+
+        # Add args to the config to trigger new logic when inputs are expanded in processing file
+        processor.num_query_tokens = model.config.num_query_tokens
+        processor.tokenizer.add_special_tokens({"additional_special_tokens": ["<image>"]})
+        model.config.image_token_index = len(processor.tokenizer) - 1
+        model.resize_token_embeddings(processor.tokenizer.vocab_size, pad_to_multiple_of=64)
+
+        # Generate again with new inputs
+        inputs = processor(images=image, text=prompt, return_tensors="pt").to(torch_device, dtype=torch.float16)
+        predictions_expanded = model.generate(**inputs, do_sample=False, max_new_tokens=15)
+        generated_text_expanded = processor.batch_decode(predictions_expanded, skip_special_tokens=True)[0].strip()
+
+        self.assertTrue(generated_text_expanded == generated_text)