🔴 [VLM] Add base model without head (#37033)

* i guessreverted all CdGen classes

* style

* llava onevision

* fix copies

* fix some tests

* some more tests

* dump

* skip these

* nevermind, i am dumb

* revert fix not needed

* fixup

* fixup

* another fixup

* more fixup to make ci finally happy

* fixup after rebasing

* fix qwen tests

* add internVL + typos here and there

* image token index -> id

* style

* fix init weights

* revert blip-2 not supported

* address comments

* fix copies

* revert blip2 test file as well

* as discussed internally, revert back CdGen models

* fix some tests

* fix more tests for compile

* CI red

* fix copies

* enumerate explicitly allowed models

* address comments

* fix tests

* fixup

* style again

* add tests for new model class

* another fixup ( x _ x )

* [fixup] unused attributes can be removed post-deprecation
This commit is contained in:
Raushan Turganbay
2025-05-07 17:47:51 +02:00
committed by GitHub
parent 3fa8d9c20e
commit 17742bd9c8
85 changed files with 7590 additions and 2904 deletions

View File

@@ -21,6 +21,7 @@ import requests
from transformers import (
AriaConfig,
AriaForConditionalGeneration,
AriaModel,
AriaTextConfig,
AutoProcessor,
AutoTokenizer,
@@ -175,7 +176,7 @@ class AriaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMi
Model tester for `AriaForConditionalGeneration`.
"""
all_model_classes = (AriaForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (AriaModel, AriaForConditionalGeneration) if is_torch_available() else ()
test_pruning = False
test_head_masking = False
_is_composite = True
@@ -281,6 +282,18 @@ class AriaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMi
def test_generate_from_inputs_embeds_with_static_cache(self):
pass
@unittest.skip(reason="Aria uses nn.MHA which is not compatible with offloading")
def test_cpu_offload(self):
pass
@unittest.skip(reason="Aria uses nn.MHA which is not compatible with offloading")
def test_disk_offload_bin(self):
pass
@unittest.skip(reason="Aria uses nn.MHA which is not compatible with offloading")
def test_disk_offload_safetensors(self):
pass
@require_torch
class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):

View File

@@ -46,6 +46,7 @@ if is_torch_available():
from transformers import (
AyaVisionForConditionalGeneration,
AyaVisionModel,
)
@@ -158,7 +159,14 @@ class AyaVisionVisionText2TextModelTester:
@require_torch
class AyaVisionModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (AyaVisionForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
AyaVisionModel,
AyaVisionForConditionalGeneration,
)
if is_torch_available()
else ()
)
all_generative_model_classes = (AyaVisionForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{

View File

@@ -46,6 +46,7 @@ if is_torch_available():
from transformers import (
Emu3ForCausalLM,
Emu3ForConditionalGeneration,
Emu3Model,
Emu3Processor,
Emu3TextModel,
)
@@ -310,7 +311,14 @@ class Emu3Vision2TextModelTester:
@require_torch
class Emu3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (Emu3ForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
Emu3Model,
Emu3ForConditionalGeneration,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = {}
test_headmasking = False
test_pruning = False
@@ -395,6 +403,10 @@ class Emu3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
def test_generate_with_static_cache(self):
pass
@unittest.skip("Emu3 doesn't support Flex attn yet!")
def test_flex_attention_with_grads(self):
pass
@require_torch
class Emu3IntegrationTest(unittest.TestCase):

View File

@@ -38,7 +38,7 @@ if is_torch_available() and is_vision_available():
if is_torch_available():
from transformers import FuyuForCausalLM
from transformers import FuyuForCausalLM, FuyuModel
class FuyuModelTester:
@@ -145,7 +145,14 @@ class FuyuModelTester:
@require_torch
class FuyuModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (FuyuForCausalLM,) if is_torch_available() else ()
all_model_classes = (
(
FuyuModel,
FuyuForCausalLM,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = (
{"text-generation": FuyuForCausalLM, "image-text-to-text": FuyuForCausalLM} if is_torch_available() else {}
)

View File

@@ -50,6 +50,7 @@ if is_torch_available():
from transformers import (
Gemma3ForCausalLM,
Gemma3ForConditionalGeneration,
Gemma3Model,
Gemma3Processor,
Gemma3TextModel,
)
@@ -148,9 +149,9 @@ class Gemma3Vision2TextModelTester:
self,
parent,
mm_tokens_per_image=2,
image_token_index=1,
boi_token_index=2,
eoi_token_index=3,
image_token_index=4,
boi_token_index=5,
eoi_token_index=6,
seq_length=25,
is_training=True,
vision_config={
@@ -242,7 +243,14 @@ class Gemma3Vision2TextModelTester:
@require_torch
class Gemma3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (Gemma3ForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
Gemma3Model,
Gemma3ForConditionalGeneration,
)
if is_torch_available()
else ()
)
all_generative_model_classes = (Gemma3ForConditionalGeneration,) if is_torch_available() else ()
test_headmasking = False
test_pruning = False

View File

@@ -34,6 +34,7 @@ if is_torch_available():
from transformers import (
GotOcr2ForConditionalGeneration,
GotOcr2Model,
)
@@ -140,7 +141,14 @@ class GotOcr2VisionText2TextModelTester:
@require_torch
class GotOcr2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (GotOcr2ForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
GotOcr2Model,
GotOcr2ForConditionalGeneration,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = (
{
"image-to-text": GotOcr2ForConditionalGeneration,
@@ -228,6 +236,10 @@ class GotOcr2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
def test_past_key_values_format(self):
pass
@unittest.skip(reason="Vision backbone doesn't support FLEX yet!")
def test_flex_attention_with_grads(self):
pass
@require_torch
class GotOcr2IntegrationTest(unittest.TestCase):

View File

@@ -54,7 +54,7 @@ if is_torch_available():
import torch
from torch import nn
from transformers import InstructBlipForConditionalGeneration, InstructBlipVisionModel
from transformers import InstructBlipForConditionalGeneration, InstructBlipModel, InstructBlipVisionModel
if is_vision_available():
@@ -460,14 +460,20 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
"attention_mask": attention_mask,
"qformer_input_ids": qformer_input_ids,
"qformer_attention_mask": qformer_attention_mask,
"labels": input_ids,
}
return config, inputs_dict
@require_torch
class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
InstructBlipModel,
InstructBlipForConditionalGeneration,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = {"image-text-to-text": InstructBlipForConditionalGeneration}
fx_compatible = False
test_head_masking = False

View File

@@ -54,7 +54,11 @@ if is_torch_available():
import torch
from torch import nn
from transformers import InstructBlipVideoForConditionalGeneration, InstructBlipVideoVisionModel
from transformers import (
InstructBlipVideoForConditionalGeneration,
InstructBlipVideoModel,
InstructBlipVideoVisionModel,
)
class InstructBlipVideoVisionModelTester:
@@ -477,7 +481,6 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester:
"attention_mask": attention_mask,
"qformer_input_ids": qformer_input_ids,
"qformer_attention_mask": qformer_attention_mask,
"labels": input_ids,
}
return config, inputs_dict
@@ -486,7 +489,9 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester:
class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
ModelTesterMixin, GenerationTesterMixin, unittest.TestCase
):
all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(InstructBlipVideoForConditionalGeneration, InstructBlipVideoModel) if is_torch_available() else ()
)
fx_compatible = False
test_head_masking = False
test_pruning = False

View File

@@ -47,9 +47,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin
if is_torch_available():
import torch
from transformers import (
InternVLForConditionalGeneration,
)
from transformers import InternVLForConditionalGeneration, InternVLModel
if is_vision_available():
@@ -191,7 +189,7 @@ class InternVLVisionText2TextModelTester:
@require_torch
class InternVLModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (InternVLForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (InternVLForConditionalGeneration, InternVLModel) if is_torch_available() else ()
all_generative_model_classes = (InternVLForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{

View File

@@ -13,6 +13,7 @@
# limitations under the License.
"""Testing suite for the PyTorch Llava model."""
import copy
import unittest
import requests
@@ -23,6 +24,7 @@ from transformers import (
AutoTokenizer,
LlavaConfig,
LlavaForConditionalGeneration,
LlavaModel,
is_torch_available,
is_vision_available,
)
@@ -166,7 +168,14 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
Model tester for `LlavaForConditionalGeneration`.
"""
all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
LlavaModel,
LlavaForConditionalGeneration,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = (
{"image-to-text": LlavaForConditionalGeneration, "image-text-to-text": LlavaForConditionalGeneration}
if is_torch_available()
@@ -238,16 +247,17 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config).to(torch_device)
_ = model(**input_dict) # successful forward with no modifications
curr_input_dict = copy.deepcopy(input_dict) # in=place modifications further
_ = model(**curr_input_dict) # successful forward with no modifications
# remove one image but leave the image token in text
input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...]
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
with self.assertRaises(ValueError):
_ = model(**input_dict)
_ = model(**curr_input_dict)
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
input_ids = input_dict["input_ids"][:1]
pixel_values = input_dict["pixel_values"][:1]
input_ids = curr_input_dict["input_ids"][:1]
pixel_values = curr_input_dict["pixel_values"][:1]
input_ids = torch.cat([input_ids, input_ids], dim=0)
# one image and two image tokens raise an error
@@ -281,7 +291,8 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
model = model_class(config).to(torch_device)
# We should have the right number of input features,
# and should be able to run a forward pass without exploding
assert model.multi_modal_projector.linear_1.in_features == expected_features
base_model = getattr(model, "model", model)
assert base_model.multi_modal_projector.linear_1.in_features == expected_features
model(**input_dict)
@unittest.skip(

View File

@@ -13,6 +13,7 @@
# limitations under the License.
"""Testing suite for the PyTorch Llava-NeXT model."""
import copy
import unittest
import requests
@@ -23,6 +24,7 @@ from transformers import (
AutoProcessor,
LlavaNextConfig,
LlavaNextForConditionalGeneration,
LlavaNextModel,
is_torch_available,
is_vision_available,
)
@@ -181,7 +183,14 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
Model tester for `LlavaNextForConditionalGeneration`.
"""
all_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
LlavaNextModel,
LlavaNextForConditionalGeneration,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = {"image-text-to-text": LlavaNextForConditionalGeneration} if is_torch_available() else {}
test_pruning = False
test_head_masking = False
@@ -265,18 +274,19 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config).to(torch_device)
_ = model(**input_dict) # successful forward with no modifications
curr_input_dict = copy.deepcopy(input_dict) # in=place modifications further
_ = model(**curr_input_dict) # successful forward with no modifications
# remove one image but leave the image token in text
input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...]
input_dict["image_sizes"] = input_dict["image_sizes"][-1:, ...]
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
curr_input_dict["image_sizes"] = curr_input_dict["image_sizes"][-1:, ...]
with self.assertRaises(ValueError):
_ = model(**input_dict)
_ = model(**curr_input_dict)
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
input_ids = input_dict["input_ids"][:1]
pixel_values = input_dict["pixel_values"][:1]
image_sizes = input_dict["image_sizes"][:1]
input_ids = curr_input_dict["input_ids"][:1]
pixel_values = curr_input_dict["pixel_values"][:1]
image_sizes = curr_input_dict["image_sizes"][:1]
input_ids = torch.cat([input_ids, input_ids], dim=0)
# one image and two image tokens raise an error
@@ -324,7 +334,8 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
model = model_class(config).to(torch_device)
# We should have the right number of input features,
# and should be able to run a forward pass without exploding
assert model.multi_modal_projector.linear_1.in_features == expected_features
base_model = getattr(model, "model", model)
assert base_model.multi_modal_projector.linear_1.in_features == expected_features
model(**input_dict)
@unittest.skip(

View File

@@ -13,6 +13,7 @@
# limitations under the License.
"""Testing suite for the PyTorch Llava-NeXT-Video model."""
import copy
import unittest
import numpy as np
@@ -23,6 +24,7 @@ from transformers import (
AutoProcessor,
LlavaNextVideoConfig,
LlavaNextVideoForConditionalGeneration,
LlavaNextVideoModel,
is_torch_available,
is_vision_available,
)
@@ -196,7 +198,14 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
Model tester for `LlavaNextVideoForConditionalGeneration`.
"""
all_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
LlavaNextVideoModel,
LlavaNextVideoForConditionalGeneration,
)
if is_torch_available()
else ()
)
test_pruning = False
test_head_masking = False
_is_composite = True
@@ -281,18 +290,19 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config).to(torch_device)
_ = model(**input_dict) # successful forward with no modifications
curr_input_dict = copy.deepcopy(input_dict) # in=place modifications further
_ = model(**curr_input_dict) # successful forward with no modifications
# remove one image but leave the image token in text
input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...]
input_dict["image_sizes"] = input_dict["image_sizes"][-1:, ...]
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
curr_input_dict["image_sizes"] = curr_input_dict["image_sizes"][-1:, ...]
with self.assertRaises(ValueError):
_ = model(**input_dict)
_ = model(**curr_input_dict)
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
input_ids = input_dict["input_ids"][:1]
pixel_values = input_dict["pixel_values"][:1]
image_sizes = input_dict["image_sizes"][:1]
input_ids = curr_input_dict["input_ids"][:1]
pixel_values = curr_input_dict["pixel_values"][:1]
image_sizes = curr_input_dict["image_sizes"][:1]
input_ids = torch.cat([input_ids, input_ids], dim=0)
# one image and two image tokens raise an error
@@ -340,7 +350,8 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
model = model_class(config).to(torch_device)
# We should have the right number of input features,
# and should be able to run a forward pass without exploding
assert model.multi_modal_projector.linear_1.in_features == expected_features
base_model = getattr(model, "model", model)
assert base_model.multi_modal_projector.linear_1.in_features == expected_features
model(**input_dict)
@unittest.skip(

View File

@@ -24,6 +24,7 @@ from transformers import (
AutoProcessor,
LlavaOnevisionConfig,
LlavaOnevisionForConditionalGeneration,
LlavaOnevisionModel,
is_torch_available,
is_vision_available,
)
@@ -182,7 +183,14 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati
Model tester for `LlavaOnevisionForConditionalGeneration`.
"""
all_model_classes = (LlavaOnevisionForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
LlavaOnevisionModel,
LlavaOnevisionForConditionalGeneration,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = (
{"image-text-to-text": LlavaOnevisionForConditionalGeneration} if is_torch_available() else {}
)
@@ -296,7 +304,8 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati
model = model_class(config).to(torch_device)
# We should have the right number of input features,
# and should be able to run a forward pass without exploding
assert model.multi_modal_projector.linear_1.in_features == expected_features
base_model = getattr(model, "model", model)
assert base_model.multi_modal_projector.linear_1.in_features == expected_features
model(**input_dict)
@unittest.skip(

View File

@@ -42,6 +42,7 @@ if is_torch_available():
from transformers import (
Mistral3ForConditionalGeneration,
Mistral3Model,
)
@@ -162,7 +163,14 @@ class Mistral3VisionText2TextModelTester:
@require_torch
class Mistral3ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
all_model_classes = (Mistral3ForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
Mistral3Model,
Mistral3ForConditionalGeneration,
)
if is_torch_available()
else ()
)
all_generative_model_classes = (Mistral3ForConditionalGeneration,) if is_torch_available() else ()
pipeline_model_mapping = (
{
@@ -278,6 +286,10 @@ class Mistral3ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
def test_sdpa_can_dispatch_on_flash(self):
pass
@unittest.skip("Pixtral does not support attention interfaces.")
def test_flex_attention_with_grads(self):
pass
@slow
@require_torch_gpu

View File

@@ -25,6 +25,7 @@ from transformers import (
MllamaConfig,
MllamaForCausalLM,
MllamaForConditionalGeneration,
MllamaModel,
is_torch_available,
is_vision_available,
)
@@ -262,7 +263,14 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
Model tester for `MllamaForConditionalGeneration`.
"""
all_model_classes = (MllamaForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
MllamaModel,
MllamaForConditionalGeneration,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = {"image-text-to-text": MllamaForConditionalGeneration} if is_torch_available() else ()
test_pruning = False
test_head_masking = False
@@ -325,19 +333,18 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
# resizing embeddings should result in successful loss computation
config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model_vocab_size = config.get_text_config().vocab_size
inputs = self._prepare_for_class(inputs, model_class, return_labels=True)
# Resize embeddings and call forward
model.resize_token_embeddings(model_vocab_size + 10)
output = model(
input_ids=inputs["input_ids"],
attention_mask=inputs["attention_mask"],
labels=inputs["labels"],
return_dict=True,
)
self.assertTrue("loss" in output)
model = MllamaForConditionalGeneration(config).to(torch_device)
model_vocab_size = config.get_text_config().vocab_size
inputs = self._prepare_for_class(inputs, MllamaForConditionalGeneration, return_labels=True)
# Resize embeddings and call forward
model.resize_token_embeddings(model_vocab_size + 10)
output = model(
input_ids=inputs["input_ids"],
attention_mask=inputs["attention_mask"],
labels=inputs["labels"],
return_dict=True,
)
self.assertTrue("loss" in output)
def _check_attentions_for_generate(
self, batch_size, attentions, prompt_length, output_length, config, decoder_past_key_values
@@ -409,6 +416,18 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
def test_assisted_decoding_with_num_logits_to_keep(self):
pass
@unittest.skip(reason="Mllama uses self.weights dirrectly causing device mismatch when offloading`")
def test_cpu_offload(self):
pass
@unittest.skip(reason="Mllama uses self.weights dirrectly causing device mismatch when offloading`")
def test_disk_offload_bin(self):
pass
@unittest.skip(reason="Mllama uses self.weights dirrectly causing device mismatch when offloading`")
def test_disk_offload_safetensors(self):
pass
@pytest.mark.generate
# overridden because mllama is not an encoder-decoder model, but has encoder-decoder-like cache
def test_past_key_values_format(self):
@@ -501,7 +520,7 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
"""
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
for model_class in self.all_generative_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()

View File

@@ -13,6 +13,7 @@
# limitations under the License.
"""Testing suite for the PyTorch PaliGemma model."""
import copy
import unittest
import requests
@@ -20,6 +21,7 @@ import requests
from transformers import (
PaliGemmaConfig,
PaliGemmaForConditionalGeneration,
PaliGemmaModel,
PaliGemmaProcessor,
is_torch_available,
is_vision_available,
@@ -177,7 +179,14 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
Model tester for `PaliGemmaForConditionalGeneration`.
"""
all_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
PaliGemmaModel,
PaliGemmaForConditionalGeneration,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = {"image-text-to-text": PaliGemmaForConditionalGeneration}
fx_compatible = False
test_pruning = False
@@ -242,16 +251,17 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config).to(torch_device)
_ = model(**input_dict) # successful forward with no modifications
curr_input_dict = copy.deepcopy(input_dict) # in=place modifications further
_ = model(**curr_input_dict) # successful forward with no modifications
# remove one image but leave the image token in text
input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...]
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
with self.assertRaises(ValueError):
_ = model(**input_dict)
_ = model(**curr_input_dict)
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
input_ids = input_dict["input_ids"][:1]
pixel_values = input_dict["pixel_values"][:1]
input_ids = curr_input_dict["input_ids"][:1]
pixel_values = curr_input_dict["pixel_values"][:1]
input_ids = torch.cat([input_ids, input_ids], dim=0)
# one image and two image tokens raise an error

View File

@@ -13,6 +13,7 @@
# limitations under the License.
"""Testing suite for the PyTorch PaliGemma model."""
import copy
import unittest
import pytest
@@ -239,16 +240,17 @@ class PaliGemma2ForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config).to(torch_device)
_ = model(**input_dict) # successful forward with no modifications
curr_input_dict = copy.deepcopy(input_dict) # in=place modifications further
_ = model(**curr_input_dict) # successful forward with no modifications
# remove one image but leave the image token in text
input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...]
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
with self.assertRaises(ValueError):
_ = model(**input_dict)
_ = model(**curr_input_dict)
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
input_ids = input_dict["input_ids"][:1]
pixel_values = input_dict["pixel_values"][:1]
input_ids = curr_input_dict["input_ids"][:1]
pixel_values = curr_input_dict["pixel_values"][:1]
input_ids = torch.cat([input_ids, input_ids], dim=0)
# one image and two image tokens raise an error

View File

@@ -13,6 +13,7 @@
# limitations under the License.
"""Testing suite for the PyTorch Qwen2.5-VL model."""
import copy
import gc
import tempfile
import unittest
@@ -23,6 +24,7 @@ from transformers import (
AutoProcessor,
Qwen2_5_VLConfig,
Qwen2_5_VLForConditionalGeneration,
Qwen2_5_VLModel,
is_torch_available,
is_vision_available,
)
@@ -180,17 +182,11 @@ class Qwen2_5_VLVisionText2TextModelTester:
input_ids[input_ids == self.vision_start_token_id] = self.pad_token_id
input_ids[:, self.num_image_tokens] = self.image_token_id
input_ids[:, self.num_image_tokens - 1] = self.vision_start_token_id
labels = torch.zeros(
(self.batch_size, self.seq_length),
dtype=torch.long,
device=torch_device,
)
inputs_dict = {
"pixel_values": pixel_values,
"image_grid_thw": torch.tensor([[1, 1, 1]] * self.batch_size),
"input_ids": input_ids,
"attention_mask": attention_mask,
"labels": labels,
}
return config, inputs_dict
@@ -201,7 +197,14 @@ class Qwen2_5_VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
Model tester for `Qwen2_5_VLForConditionalGeneration`.
"""
all_model_classes = (Qwen2_5_VLForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
Qwen2_5_VLModel,
Qwen2_5_VLForConditionalGeneration,
)
if is_torch_available()
else ()
)
test_pruning = False
test_head_masking = False
@@ -236,19 +239,20 @@ class Qwen2_5_VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
for model_class in self.all_model_classes:
model = model_class(config).to(torch_device)
_ = model(**input_dict) # successful forward with no modifications
curr_input_dict = copy.deepcopy(input_dict)
# remove one image but leave the image token in text
patch_size = config.vision_config.patch_size
one_img_length = (self.model_tester.image_size**2) // (patch_size**2)
input_dict["pixel_values"] = input_dict["pixel_values"][-one_img_length:, ...]
input_dict["image_grid_thw"] = input_dict["image_grid_thw"][-1:, ...]
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-one_img_length:, ...]
curr_input_dict["image_grid_thw"] = curr_input_dict["image_grid_thw"][-1:, ...]
with self.assertRaises(ValueError):
_ = model(**input_dict)
_ = model(**curr_input_dict)
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
input_ids = input_dict["input_ids"][:1]
pixel_values = input_dict["pixel_values"][:one_img_length]
image_grid_thw = input_dict["image_grid_thw"][:1]
input_ids = curr_input_dict["input_ids"][:1]
pixel_values = curr_input_dict["pixel_values"][:one_img_length]
image_grid_thw = curr_input_dict["image_grid_thw"][:1]
input_ids = torch.cat([input_ids, input_ids], dim=0)
# one image and two image tokens raise an error
@@ -375,6 +379,29 @@ class Qwen2_5_VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
def test_save_load_fast_init_from_base(self):
pass
# The multimodal base model embeds will not match ids, due to pixel values. We can't change base test
# because in some models `pixel_values` are required. Will be fixed when we add support for merging `embeds+pixels`
# TODO: @raushan
def test_inputs_embeds_matches_input_ids(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, model_class)
input_ids = inputs["input_ids"]
del inputs["input_ids"]
del inputs["pixel_values"]
inputs_embeds = model.get_input_embeddings()(input_ids)
with torch.no_grad():
out_ids = model(input_ids=input_ids, **inputs)[0]
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
torch.testing.assert_close(out_embeds, out_ids)
@require_torch
class Qwen2_5_VLIntegrationTest(unittest.TestCase):

View File

@@ -13,6 +13,7 @@
# limitations under the License.
"""Testing suite for the PyTorch Qwen2-VL model."""
import copy
import gc
import unittest
@@ -22,6 +23,7 @@ from transformers import (
AutoProcessor,
Qwen2VLConfig,
Qwen2VLForConditionalGeneration,
Qwen2VLModel,
is_torch_available,
is_vision_available,
)
@@ -169,17 +171,12 @@ class Qwen2VLVisionText2TextModelTester:
input_ids[input_ids == self.vision_start_token_id] = self.pad_token_id
input_ids[:, self.num_image_tokens] = self.image_token_id
input_ids[:, self.num_image_tokens - 1] = self.vision_start_token_id
labels = torch.zeros(
(self.batch_size, self.seq_length),
dtype=torch.long,
device=torch_device,
)
inputs_dict = {
"pixel_values": pixel_values,
"image_grid_thw": torch.tensor([[1, 1, 1]] * self.batch_size),
"input_ids": input_ids,
"attention_mask": attention_mask,
"labels": labels,
}
return config, inputs_dict
@@ -190,7 +187,14 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
Model tester for `Qwen2VLForConditionalGeneration`.
"""
all_model_classes = (Qwen2VLForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
Qwen2VLModel,
Qwen2VLForConditionalGeneration,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = {"image-text-to-text": Qwen2VLForConditionalGeneration}
test_pruning = False
test_head_masking = False
@@ -226,20 +230,21 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config).to(torch_device)
_ = model(**input_dict) # successful forward with no modifications
curr_input_dict = copy.deepcopy(input_dict)
_ = model(**curr_input_dict) # successfull forward with no modifications
# remove one image but leave the image token in text
patch_size = config.vision_config.patch_size
one_img_length = (self.model_tester.image_size**2) // (patch_size**2)
input_dict["pixel_values"] = input_dict["pixel_values"][-one_img_length:, ...]
input_dict["image_grid_thw"] = input_dict["image_grid_thw"][-1:, ...]
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-one_img_length:, ...]
curr_input_dict["image_grid_thw"] = curr_input_dict["image_grid_thw"][-1:, ...]
with self.assertRaises(ValueError):
_ = model(**input_dict)
_ = model(**curr_input_dict)
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
input_ids = input_dict["input_ids"][:1]
pixel_values = input_dict["pixel_values"][:one_img_length]
image_grid_thw = input_dict["image_grid_thw"][:1]
input_ids = curr_input_dict["input_ids"][:1]
pixel_values = curr_input_dict["pixel_values"][:one_img_length]
image_grid_thw = curr_input_dict["image_grid_thw"][:1]
input_ids = torch.cat([input_ids, input_ids], dim=0)
# one image and two image tokens raise an error
@@ -262,11 +267,11 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
model = model_class(config).to(torch_device)
# Generate and make sure rope_deltas are not `None`
self.assertTrue(model.rope_deltas is None)
self.assertTrue(model.model.rope_deltas is None)
generation_output = model.generate(
**input_dict, max_new_tokens=4, return_dict_in_generate=True, output_logits=True
)
self.assertTrue(model.rope_deltas is not None)
self.assertTrue(model.model.rope_deltas is not None)
# Now if we try to do forward pass, we should get new rope logits, because cache is not passed
forward_output = model(**input_dict)
@@ -320,6 +325,29 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
def test_save_load_fast_init_from_base(self):
pass
# The multimodal base model embeds will not match ids, due to pixel values. We can't change base test
# because in some models `pixel_values` are required. Will be fixed when we add support for merging `embeds+pixels`
# TODO: @raushan
def test_inputs_embeds_matches_input_ids(self):
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config)
model.to(torch_device)
model.eval()
inputs = self._prepare_for_class(inputs_dict, model_class)
input_ids = inputs["input_ids"]
del inputs["input_ids"]
del inputs["pixel_values"]
inputs_embeds = model.get_input_embeddings()(input_ids)
with torch.no_grad():
out_ids = model(input_ids=input_ids, **inputs)[0]
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
torch.testing.assert_close(out_embeds, out_ids)
@require_torch
class Qwen2VLIntegrationTest(unittest.TestCase):

View File

@@ -13,6 +13,7 @@
# limitations under the License.
"""Testing suite for the PyTorch VideoLlava model."""
import copy
import unittest
import numpy as np
@@ -23,6 +24,7 @@ from parameterized import parameterized
from transformers import (
VideoLlavaConfig,
VideoLlavaForConditionalGeneration,
VideoLlavaModel,
VideoLlavaProcessor,
is_torch_available,
is_vision_available,
@@ -190,7 +192,14 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
Model tester for `VideoLlavaForConditionalGeneration`.
"""
all_model_classes = (VideoLlavaForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
VideoLlavaModel,
VideoLlavaForConditionalGeneration,
)
if is_torch_available()
else ()
)
fx_compatible = False
test_pruning = False
test_resize_embeddings = True
@@ -235,46 +244,49 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
def test_mixed_input(self):
config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
curr_inputs = copy.deepcopy(inputs)
model = model_class(config).to(torch_device).eval()
# test that the forward does not fail
with torch.no_grad():
_ = model(**inputs)
_ = model(**curr_inputs)
# if we remove some images from inputs leaving only one
# image number mismatch error should raise
inputs["pixel_values_images"] = inputs["pixel_values_images"][:1]
curr_inputs["pixel_values_images"] = curr_inputs["pixel_values_images"][:1]
with self.assertRaises(ValueError):
_ = model(**inputs)
_ = model(**curr_inputs)
def test_video_only_input(self):
config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
curr_inputs = copy.deepcopy(inputs)
model = model_class(config).to(torch_device).eval()
# replace image token id with dummy id
# Error will be raised as num-image-tokens and num-of-image-embeds mismatch
inputs["input_ids"][:, : self.model_tester.num_image_tokens] = 2
curr_inputs["input_ids"][:, : self.model_tester.num_image_tokens] = 2
with self.assertRaises(ValueError):
_ = model(**inputs)
_ = model(**curr_inputs)
inputs["pixel_values_images"] = None
_ = model(**inputs)
curr_inputs["pixel_values_images"] = None
_ = model(**curr_inputs)
def test_image_only_input(self):
config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
curr_inputs = copy.deepcopy(inputs)
model = model_class(config).to(torch_device).eval()
# set dummy id, which is not video token id
# Error will be raised as num-video-tokens and num-of-video-embeds mismatch
inputs["input_ids"][
curr_inputs["input_ids"][
:,
self.model_tester.num_image_tokens : self.model_tester.num_image_tokens
+ self.model_tester.num_video_tokens,
] = 2
with self.assertRaises(ValueError):
_ = model(**inputs)
_ = model(**curr_inputs)
inputs["pixel_values_videos"] = None
_ = model(**inputs)
curr_inputs["pixel_values_videos"] = None
_ = model(**curr_inputs)
def test_batching_equivalence(self):
def recursive_check(batched_object, single_row_object, model_name, key):
@@ -386,16 +398,17 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config).to(torch_device)
_ = model(**input_dict) # successful forward with no modifications
curr_input_dict = copy.deepcopy(input_dict)
_ = model(**curr_input_dict) # successfull forward with no modifications
# remove one image but leave the image token in text
input_dict["pixel_values_images"] = input_dict["pixel_values_images"][-1:, ...]
curr_input_dict["pixel_values_images"] = curr_input_dict["pixel_values_images"][-1:, ...]
with self.assertRaises(ValueError):
_ = model(**input_dict)
_ = model(**curr_input_dict)
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
input_ids = input_dict["input_ids"][:1]
pixel_values = input_dict["pixel_values_images"][:1]
input_ids = curr_input_dict["input_ids"][:1]
pixel_values = curr_input_dict["pixel_values_images"][:1]
input_ids = torch.cat([input_ids, input_ids], dim=0)
# one image and two image tokens raise an error
@@ -429,7 +442,8 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
model = model_class(config).to(torch_device)
# We should have the right number of input features,
# and should be able to run a forward pass without exploding
assert model.multi_modal_projector.linear_1.in_features == expected_features
base_model = getattr(model, "model", model)
assert base_model.multi_modal_projector.linear_1.in_features == expected_features
model(**input_dict)

View File

@@ -13,6 +13,7 @@
# limitations under the License.
"""Testing suite for the PyTorch VipLlava model."""
import copy
import unittest
import requests
@@ -22,6 +23,7 @@ from transformers import (
AutoProcessor,
VipLlavaConfig,
VipLlavaForConditionalGeneration,
VipLlavaModel,
is_torch_available,
is_vision_available,
)
@@ -165,7 +167,14 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
Model tester for `VipLlavaForConditionalGeneration`.
"""
all_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
all_model_classes = (
(
VipLlavaModel,
VipLlavaForConditionalGeneration,
)
if is_torch_available()
else ()
)
pipeline_model_mapping = {"image-text-to-text": VipLlavaForConditionalGeneration} if is_torch_available() else {}
fx_compatible = False
test_pruning = False
@@ -236,16 +245,17 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
model = model_class(config).to(torch_device)
_ = model(**input_dict) # successful forward with no modifications
curr_input_dict = copy.deepcopy(input_dict) # in=place modifications further
_ = model(**curr_input_dict) # successful forward with no modifications
# remove one image but leave the image token in text
input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...]
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
with self.assertRaises(ValueError):
_ = model(**input_dict)
_ = model(**curr_input_dict)
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
input_ids = input_dict["input_ids"][:1]
pixel_values = input_dict["pixel_values"][:1]
input_ids = curr_input_dict["input_ids"][:1]
pixel_values = curr_input_dict["pixel_values"][:1]
input_ids = torch.cat([input_ids, input_ids], dim=0)
# one image and two image tokens raise an error
@@ -284,7 +294,8 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
model = model_class(config).to(torch_device)
# We should have the right number of input features,
# and should be able to run a forward pass without exploding
assert model.multi_modal_projector.linear_1.in_features == expected_features
base_model = getattr(model, "model", model)
assert base_model.multi_modal_projector.linear_1.in_features == expected_features
model(**input_dict)
@unittest.skip(
@@ -311,6 +322,10 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
pass
@unittest.skip("LLaVA vision backbones doesn't support flex attention yet")
def test_flex_attention_with_grads(self):
pass
@require_torch
class VipLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):

View File

@@ -3494,8 +3494,8 @@ class ModelTesterMixin:
vision_model_name = [name for name in vision_model_names if hasattr(model_sdpa, name)][0]
language_model_name = [name for name in language_model_names if hasattr(model_sdpa, name)][0]
vision_model_sdpa = getattr(model, vision_model_name)
language_model_sdpa = getattr(model, language_model_name)
vision_model_sdpa = getattr(model_sdpa, vision_model_name)
language_model_sdpa = getattr(model_sdpa, language_model_name)
text_attn = "sdpa" if language_model_sdpa._supports_sdpa else "eager"
vision_attn = "sdpa" if vision_model_sdpa._supports_sdpa else "eager"
@@ -4489,7 +4489,8 @@ class ModelTesterMixin:
@require_torch_gpu
def test_flex_attention_with_grads(self):
for model_class in self.all_model_classes:
if not model_class._supports_flex_attn:
# TODO: raushan, fix for composite models after making VLMs support new attn API
if not model_class._supports_flex_attn or self._is_composite:
self.skipTest(reason="This model does not support flex attention")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
config._attn_implementation = "flex_attention"