🔴 [VLM] Add base model without head (#37033)

* i guess reverted all CdGen classes

* style

* llava onevision

* fix copies

* fix some tests

* some more tests

* dump

* skip these

* nevermind, i am dumb

* revert fix not needed

* fixup

* fixup

* another fixup

* more fixup to make ci finally happy

* fixup after rebasing

* fix qwen tests

* add internVL + typos here and there

* image token index -> id

* style

* fix init weights

* revert blip-2 not supported

* address comments

* fix copies

* revert blip2 test file as well

* as discussed internally, revert back CdGen models

* fix some tests

* fix more tests for compile

* CI red

* fix copies

* enumerate explicitly allowed models

* address comments

* fix tests

* fixup

* style again

* add tests for new model class

* another fixup ( x _ x )

* [fixup] unused attributes can be removed post-deprecation
This commit is contained in:
Raushan Turganbay
2025-05-07 17:47:51 +02:00
committed by GitHub
parent 3fa8d9c20e
commit 17742bd9c8
85 changed files with 7590 additions and 2904 deletions

View File

@@ -13,6 +13,7 @@
# limitations under the License.
"""Testing suite for the PyTorch Llava-NeXT model."""
import copy
import unittest
import requests
@@ -23,6 +24,7 @@ from transformers import (
AutoProcessor,
LlavaNextConfig,
LlavaNextForConditionalGeneration,
LlavaNextModel,
is_torch_available,
is_vision_available,
)
@@ -181,7 +183,14 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
Model tester for `LlavaNextForConditionalGeneration`.
"""
all_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
LlavaNextModel,
LlavaNextForConditionalGeneration,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = {"image-text-to-text": LlavaNextForConditionalGeneration} if is_torch_available() else {}
test_pruning = False
test_head_masking = False
@@ -265,18 +274,19 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config).to(torch_device)
_ = model(**input_dict) # successful forward with no modifications
curr_input_dict = copy.deepcopy(input_dict) # deep copy, because the dict is modified in-place further below
_ = model(**curr_input_dict) # successful forward with no modifications
# remove one image but leave the image token in text
input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...]
input_dict["image_sizes"] = input_dict["image_sizes"][-1:, ...]
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
curr_input_dict["image_sizes"] = curr_input_dict["image_sizes"][-1:, ...]
with self.assertRaises(ValueError):
_ = model(**input_dict)
_ = model(**curr_input_dict)
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
input_ids = input_dict["input_ids"][:1]
pixel_values = input_dict["pixel_values"][:1]
image_sizes = input_dict["image_sizes"][:1]
input_ids = curr_input_dict["input_ids"][:1]
pixel_values = curr_input_dict["pixel_values"][:1]
image_sizes = curr_input_dict["image_sizes"][:1]
input_ids = torch.cat([input_ids, input_ids], dim=0)
# one image and two image tokens raise an error
@@ -324,7 +334,8 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
model = model_class(config).to(torch_device)
# We should have the right number of input features,
# and should be able to run a forward pass without exploding
assert model.multi_modal_projector.linear_1.in_features == expected_features
base_model = getattr(model, "model", model)
assert base_model.multi_modal_projector.linear_1.in_features == expected_features
model(**input_dict)
@unittest.skip(