Granite Vision Support (#35579)
* Add multimodal granite support Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com> Support multiple image feature layers Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com> * Remove failing validation for visual encoders with no cls Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com> * Update llava based models / configs to support list of feature layers Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com> * Add tests for multiple feature layers Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com> * Use conditional instead of except for misaligned feature shapes Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com> * crop cls from each hidden state Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com> * Fix formatting Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com> * Support single vision feature int in vipllava Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com> * Fix typo in vision feature selection strategy validation Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com> * Add tentative integration test for granite vision models Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com> * Add granite vision docs Replace multimodal granite refs with granite vision Add granite vision / llava next alias Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com> * Use image url in granitevision example Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com> --------- Signed-off-by: Alex-Brooks <Alex.Brooks@ibm.com> Signed-off-by: Alex-Brooks <Alex.brooks@ibm.com>
This commit is contained in:
@@ -17,6 +17,7 @@
|
||||
import unittest
|
||||
|
||||
import requests
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
@@ -272,6 +273,32 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
|
||||
pixel_values = torch.cat([pixel_values, pixel_values], dim=0)
|
||||
_ = model(input_ids=input_ids, pixel_values=pixel_values)
|
||||
|
||||
@parameterized.expand([(-1,), ([-1],), ([-1, -2],)])
def test_vision_feature_layers(self, vision_feature_layer):
    """
    Build each model with `vision_feature_layer` given as a single int or as
    a list of ints, check the width of the projector's first linear layer,
    and run one forward pass.
    """
    config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
    config.vision_feature_layer = vision_feature_layer

    # A bare int counts as one selected layer; a list counts as len(list).
    if isinstance(vision_feature_layer, int):
        layer_count = 1
    else:
        layer_count = len(vision_feature_layer)
    projector_in_features = layer_count * config.vision_config.hidden_size

    for model_cls in self.all_model_classes:
        model = model_cls(config).to(torch_device)
        first_projection = model.multi_modal_projector.linear_1
        # The projector input must be one vision hidden size per selected
        # layer, and the forward pass has to complete without raising.
        assert first_projection.in_features == projector_in_features
        model(**inputs)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
|
||||
@@ -18,6 +18,7 @@ import unittest
|
||||
|
||||
import requests
|
||||
from huggingface_hub import hf_hub_download
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
@@ -321,6 +322,32 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
|
||||
image_sizes = torch.cat([image_sizes, image_sizes], dim=0)
|
||||
_ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes)
|
||||
|
||||
@parameterized.expand([(-1,), ([-1],), ([-1, -2],)])
def test_vision_feature_layers(self, vision_feature_layer):
    """
    Build each model with `vision_feature_layer` given as a single int or as
    a list of ints, check the width of the projector's first linear layer,
    and run one forward pass.
    """
    config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
    config.vision_feature_layer = vision_feature_layer

    # A bare int counts as one selected layer; a list counts as len(list).
    if isinstance(vision_feature_layer, int):
        layer_count = 1
    else:
        layer_count = len(vision_feature_layer)
    projector_in_features = layer_count * config.vision_config.hidden_size

    for model_cls in self.all_model_classes:
        model = model_cls(config).to(torch_device)
        first_projection = model.multi_modal_projector.linear_1
        # The projector input must be one vision hidden size per selected
        # layer, and the forward pass has to complete without raising.
        assert first_projection.in_features == projector_in_features
        model(**inputs)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
@@ -558,3 +585,25 @@ class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
self.processor.decode(output[0], skip_special_tokens=True),
|
||||
EXPECTED_DECODED_TEXT,
|
||||
)
|
||||
|
||||
@unittest.skip(reason="Granite multimodal [vision] models are not yet released")
@slow
def test_granite_vision(self):
    """
    Compare the decoded generation of a granite vision checkpoint against a
    fixed expected string. Granite vision uses multiple vision feature layers
    and a visual encoder with no CLS (siglip).
    """
    # TODO @alex-jw-brooks - update the path and enable this test once the 2b model is released
    checkpoint = "llava-granite-2b"
    model = LlavaNextForConditionalGeneration.from_pretrained(checkpoint)
    self.processor = AutoProcessor.from_pretrained(checkpoint)

    question_prompt = "<|user|>\n<image>\nWhat is shown in this image?\n<|assistant|>\n"
    inputs = self.processor(question_prompt, self.image, return_tensors="pt").to(model.device)

    # Generate at most 30 new tokens and compare the decoded text.
    generated_ids = model.generate(**inputs, max_new_tokens=30)
    expected_text = "<|user|>\n\nWhat is shown in this image?\n<|assistant|>\nThe image depicts a diagram."
    decoded_text = self.processor.decode(generated_ids[0], skip_special_tokens=True)
    self.assertEqual(decoded_text, expected_text)
|
||||
|
||||
@@ -18,6 +18,7 @@ import unittest
|
||||
|
||||
import numpy as np
|
||||
from huggingface_hub import hf_hub_download
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
@@ -338,6 +339,32 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
|
||||
image_sizes = torch.cat([image_sizes, image_sizes], dim=0)
|
||||
_ = model(input_ids=input_ids, pixel_values=pixel_values, image_sizes=image_sizes)
|
||||
|
||||
@parameterized.expand([(-1,), ([-1],), ([-1, -2],)])
def test_vision_feature_layers(self, vision_feature_layer):
    """
    Build each model with `vision_feature_layer` given as a single int or as
    a list of ints, check the width of the projector's first linear layer,
    and run one forward pass.
    """
    config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
    config.vision_feature_layer = vision_feature_layer

    # A bare int counts as one selected layer; a list counts as len(list).
    if isinstance(vision_feature_layer, int):
        layer_count = 1
    else:
        layer_count = len(vision_feature_layer)
    projector_in_features = layer_count * config.vision_config.hidden_size

    for model_cls in self.all_model_classes:
        model = model_cls(config).to(torch_device)
        first_projection = model.multi_modal_projector.linear_1
        # The projector input must be one vision hidden size per selected
        # layer, and the forward pass has to complete without raising.
        assert first_projection.in_features == projector_in_features
        model(**inputs)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
|
||||
@@ -19,6 +19,7 @@ import unittest
|
||||
import numpy as np
|
||||
import requests
|
||||
from huggingface_hub import hf_hub_download
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
@@ -292,6 +293,32 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati
|
||||
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
|
||||
self.assertTrue(torch.allclose(out_embeds, out_ids))
|
||||
|
||||
@parameterized.expand([(-1,), ([-1],), ([-1, -2],)])
def test_vision_feature_layers(self, vision_feature_layer):
    """
    Build each model with `vision_feature_layer` given as a single int or as
    a list of ints, check the width of the projector's first linear layer,
    and run one forward pass.
    """
    config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
    config.vision_feature_layer = vision_feature_layer

    # A bare int counts as one selected layer; a list counts as len(list).
    if isinstance(vision_feature_layer, int):
        layer_count = 1
    else:
        layer_count = len(vision_feature_layer)
    projector_in_features = layer_count * config.vision_config.hidden_size

    for model_cls in self.all_model_classes:
        model = model_cls(config).to(torch_device)
        first_projection = model.multi_modal_projector.linear_1
        # The projector input must be one vision hidden size per selected
        # layer, and the forward pass has to complete without raising.
        assert first_projection.in_features == projector_in_features
        model(**inputs)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecure seem to not compute gradients properly when using GC, SiglipVisionModel does not support standalone training"
|
||||
)
|
||||
|
||||
@@ -19,6 +19,7 @@ import unittest
|
||||
import numpy as np
|
||||
import requests
|
||||
from huggingface_hub import hf_hub_download
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
VideoLlavaConfig,
|
||||
@@ -419,6 +420,32 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
|
||||
pixel_values = torch.cat([pixel_values, pixel_values], dim=0)
|
||||
_ = model(input_ids=input_ids, pixel_values_images=pixel_values)
|
||||
|
||||
@parameterized.expand([(-1,), ([-1],), ([-1, -2],)])
def test_vision_feature_layers(self, vision_feature_layer):
    """
    Build each model with `vision_feature_layer` given as a single int or as
    a list of ints, check the width of the projector's first linear layer,
    and run one forward pass.
    """
    config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
    config.vision_feature_layer = vision_feature_layer

    # A bare int counts as one selected layer; a list counts as len(list).
    if isinstance(vision_feature_layer, int):
        layer_count = 1
    else:
        layer_count = len(vision_feature_layer)
    projector_in_features = layer_count * config.vision_config.hidden_size

    for model_cls in self.all_model_classes:
        model = model_cls(config).to(torch_device)
        first_projection = model.multi_modal_projector.linear_1
        # The projector input must be one vision hidden size per selected
        # layer, and the forward pass has to complete without raising.
        assert first_projection.in_features == projector_in_features
        model(**inputs)
|
||||
|
||||
|
||||
@require_torch
|
||||
class VideoLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
|
||||
@@ -17,6 +17,7 @@
|
||||
import unittest
|
||||
|
||||
import requests
|
||||
from parameterized import parameterized
|
||||
|
||||
from transformers import (
|
||||
AutoProcessor,
|
||||
@@ -257,6 +258,37 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
|
||||
pixel_values = torch.cat([pixel_values, pixel_values], dim=0)
|
||||
_ = model(input_ids=input_ids, pixel_values=pixel_values)
|
||||
|
||||
@parameterized.expand([(-1,), ([-1],), ([-1, -2],)])
def test_vision_feature_layers(self, vision_feature_layers):
    """
    Build each model with `vision_feature_layers` given as a single int or as
    a list of ints, check the width of the projector's first linear layer,
    and run one forward pass.
    """
    # NOTE: vipllava's config key is `vision_feature_layers` (plural), unlike the
    # `vision_feature_layer` key of the other llava classes. Those classes started
    # with a single feature layer and gained list support with granite vision;
    # vipllava started with multiple feature layers and gained single-layer
    # support for compatibility reasons.
    config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
    config.vision_feature_layers = vision_feature_layers

    # A bare int counts as one selected layer; a list counts as len(list).
    if isinstance(vision_feature_layers, int):
        layer_count = 1
    else:
        layer_count = len(vision_feature_layers)
    projector_in_features = layer_count * config.vision_config.hidden_size

    for model_cls in self.all_model_classes:
        model = model_cls(config).to(torch_device)
        first_projection = model.multi_modal_projector.linear_1
        # The projector input must be one vision hidden size per selected
        # layer, and the forward pass has to complete without raising.
        assert first_projection.in_features == projector_in_features
        model(**inputs)
|
||||
|
||||
@unittest.skip(
|
||||
reason="This architecure seem to not compute gradients properly when using GC, check: https://github.com/huggingface/transformers/pull/27124"
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user