Expand inputs in processors for VLMs (#30962)
* let it be * draft * should not have changed * add warnings * fix & add tests * fix tests * ipnuts embeds cannot be passed with pixels * more updates * paligemma ready! * minor typos * update blip-2 * fix tests & raise error * docstring * add blip2 test * tmp * add image seq length to config * update docstring * delete * fix tests * fix blip * fix paligemma * out-of-place scatter * add llava-next-video * Update src/transformers/models/blip_2/modeling_blip_2.py Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com> * remove tmp * codestyle * nits * more nits * remove overriding in tests * comprehension when merging video * fix-copies * revert changes for embeds test * fix tests after making comprehension * Update src/transformers/models/blip_2/processing_blip_2.py Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com> * Update src/transformers/models/blip_2/processing_blip_2.py Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com> * more updates * fix tests --------- Co-authored-by: Pablo Montalvo <39954772+molbap@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
2a5a6ad18a
commit
a29eabd0eb
@@ -53,9 +53,9 @@ class PaliGemmaVisionText2TextModelTester:
|
||||
self,
|
||||
parent,
|
||||
ignore_index=-100,
|
||||
image_token_index=98,
|
||||
image_token_index=0,
|
||||
projector_hidden_act="gelu",
|
||||
seq_length=7,
|
||||
seq_length=25,
|
||||
vision_feature_select_strategy="default",
|
||||
vision_feature_layer=-1,
|
||||
projection_dim=32,
|
||||
@@ -87,8 +87,8 @@ class PaliGemmaVisionText2TextModelTester:
|
||||
is_training=True,
|
||||
vision_config={
|
||||
"use_labels": True,
|
||||
"image_size": 30,
|
||||
"patch_size": 2,
|
||||
"image_size": 20,
|
||||
"patch_size": 5,
|
||||
"num_image_tokens": 4,
|
||||
"num_channels": 3,
|
||||
"is_training": True,
|
||||
@@ -106,6 +106,7 @@ class PaliGemmaVisionText2TextModelTester:
|
||||
):
|
||||
self.parent = parent
|
||||
self.ignore_index = ignore_index
|
||||
# `image_token_index` is set to 0 to pass "resize_embeddings" test, do not modify
|
||||
self.image_token_index = image_token_index
|
||||
self.projector_hidden_act = projector_hidden_act
|
||||
self.vision_feature_select_strategy = vision_feature_select_strategy
|
||||
@@ -157,8 +158,10 @@ class PaliGemmaVisionText2TextModelTester:
|
||||
config, pixel_values = config_and_inputs
|
||||
input_ids = ids_tensor([self.batch_size, self.seq_length], config.text_config.vocab_size - 1) + 1
|
||||
attention_mask = input_ids.ne(1).to(torch_device)
|
||||
# setting the 4 first tokens to be image
|
||||
input_ids[:, :4] = config.image_token_index
|
||||
# set the 16 first tokens to be image, and ensure that no other tokens are image tokens
|
||||
# do not change this unless you modified image size or patch size
|
||||
input_ids = torch.where(input_ids == config.image_token_index, 2, input_ids)
|
||||
input_ids[:, :16] = config.image_token_index
|
||||
inputs_dict = {
|
||||
"pixel_values": pixel_values,
|
||||
"input_ids": input_ids,
|
||||
@@ -185,6 +188,49 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, unittest.Test
|
||||
self.model_tester = PaliGemmaVisionText2TextModelTester(self)
|
||||
self.config_tester = ConfigTester(self, config_class=PaliGemmaConfig, has_text_modality=False)
|
||||
|
||||
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
|
||||
def test_inputs_embeds(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
inputs = self._prepare_for_class(inputs_dict, model_class)
|
||||
|
||||
input_ids = inputs["input_ids"]
|
||||
del inputs["input_ids"]
|
||||
del inputs["pixel_values"]
|
||||
|
||||
wte = model.get_input_embeddings()
|
||||
inputs["inputs_embeds"] = wte(input_ids)
|
||||
|
||||
with torch.no_grad():
|
||||
model(**inputs)
|
||||
|
||||
# overwrite inputs_embeds tests because we need to delete "pixel values" for LVLMs
|
||||
# while some other models require pixel_values to be present
|
||||
def test_inputs_embeds_matches_input_ids(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
inputs = self._prepare_for_class(inputs_dict, model_class)
|
||||
input_ids = inputs["input_ids"]
|
||||
del inputs["input_ids"]
|
||||
del inputs["pixel_values"]
|
||||
|
||||
inputs_embeds = model.get_input_embeddings()(input_ids)
|
||||
|
||||
with torch.no_grad():
|
||||
out_ids = model(input_ids=input_ids, **inputs)[0]
|
||||
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
|
||||
self.assertTrue(torch.allclose(out_embeds, out_ids))
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user