Fix signatures for processing kwargs (#35105)

* add conversion script * remove pg2 refs * fixup style * small update * get correct scaling * add back missing bos * fix missing config keys * might revert this pos_embeddings * fixup 9b config * fix 9b * fixup 9b conversion for good + add back num_hidden_layers * add correct query scaling for 2b, 9b, 27b * fixup 27b conversion * Additional variant: 27b-896 * Use CPU for conversion to reduce GPU RAM requirements * fix causal mask generation + formatting * fix in-training causal mask generation edge case * trigger CI * update config * update config * update config * update config * update config * update config * update config * update config * update config * move conversion file to main model dir * handle multi-images + bos token * address comments for input ids * revert ci fixes * [run-slow] paligemma * fix * [run-slow] paligemma * skip end 2 end * [run-slow] paligemma --------- Co-authored-by: Pedro Cuenca <pedro@huggingface.co> Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
2024-12-05 18:15:48 +01:00
parent e27465c801
commit a5bb528471
5 changed files with 459 additions and 19 deletions
--- a/tests/models/paligemma/test_processor_paligemma.py
+++ b/tests/models/paligemma/test_processor_paligemma.py
@@ -63,8 +63,8 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
        tokenizer = self.get_component("tokenizer")

        processor = self.processor_class(tokenizer=tokenizer, image_processor=image_processor)
-        text_multi_images = "<image><image><bos>Dummy text!"
-        text_single_image = "<image><bos>Dummy text!"
+        text_multi_images = "<image><image>Dummy text!"
+        text_single_image = "<image>Dummy text!"
        text_no_image = "Dummy text!"

        image = self.prepare_image_inputs()
@@ -85,7 +85,7 @@ class PaliGemmaProcessorTest(ProcessorTesterMixin, unittest.TestCase):
            self.assertTrue(out_noimage[k].tolist() == out_multiimages[k].tolist())

        text_batched = ["Dummy text!", "Dummy text!"]
-        text_batched_with_image = ["<image><bos>Dummy text!", "<image><bos>Dummy text!"]
+        text_batched_with_image = ["<image>Dummy text!", "<image>Dummy text!"]
        out_images = processor(text=text_batched_with_image, images=[image, image], return_tensors="np")
        out_noimage_nested = processor(text=text_batched, images=[[image], [image]], return_tensors="np")
        out_noimage = processor(text=text_batched, images=[image, image], return_tensors="np")