🔴 [VLM] Add base model without head (#37033)
* i guess reverted all CdGen classes * style * llava onevision * fix copies * fix some tests * some more tests * dump * skip these * nevermind, i am dumb * revert fix not needed * fixup * fixup * another fixup * more fixup to make ci finally happy * fixup after rebasing * fix qwen tests * add internVL + typos here and there * image token index -> id * style * fix init weights * revert blip-2 not supported * address comments * fix copies * revert blip2 test file as well * as discussed internally, revert back CdGen models * fix some tests * fix more tests for compile * CI red * fix copies * enumerate explicitly allowed models * address comments * fix tests * fixup * style again * add tests for new model class * another fixup ( x _ x ) * [fixup] unused attributes can be removed post-deprecation
This commit is contained in:
committed by
GitHub
parent
3fa8d9c20e
commit
17742bd9c8
@@ -21,6 +21,7 @@ import requests
|
||||
from transformers import (
|
||||
AriaConfig,
|
||||
AriaForConditionalGeneration,
|
||||
AriaModel,
|
||||
AriaTextConfig,
|
||||
AutoProcessor,
|
||||
AutoTokenizer,
|
||||
@@ -175,7 +176,7 @@ class AriaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMi
|
||||
Model tester for `AriaForConditionalGeneration`.
|
||||
"""
|
||||
|
||||
all_model_classes = (AriaForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (AriaModel, AriaForConditionalGeneration) if is_torch_available() else ()
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
_is_composite = True
|
||||
@@ -281,6 +282,18 @@ class AriaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterMi
|
||||
def test_generate_from_inputs_embeds_with_static_cache(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(reason="Aria uses nn.MHA which is not compatible with offloading")
|
||||
def test_cpu_offload(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(reason="Aria uses nn.MHA which is not compatible with offloading")
|
||||
def test_disk_offload_bin(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(reason="Aria uses nn.MHA which is not compatible with offloading")
|
||||
def test_disk_offload_safetensors(self):
|
||||
pass
|
||||
|
||||
|
||||
@require_torch
|
||||
class AriaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
|
||||
@@ -46,6 +46,7 @@ if is_torch_available():
|
||||
|
||||
from transformers import (
|
||||
AyaVisionForConditionalGeneration,
|
||||
AyaVisionModel,
|
||||
)
|
||||
|
||||
|
||||
@@ -158,7 +159,14 @@ class AyaVisionVisionText2TextModelTester:
|
||||
|
||||
@require_torch
|
||||
class AyaVisionModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (AyaVisionForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
AyaVisionModel,
|
||||
AyaVisionForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
all_generative_model_classes = (AyaVisionForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = (
|
||||
{
|
||||
|
||||
@@ -46,6 +46,7 @@ if is_torch_available():
|
||||
from transformers import (
|
||||
Emu3ForCausalLM,
|
||||
Emu3ForConditionalGeneration,
|
||||
Emu3Model,
|
||||
Emu3Processor,
|
||||
Emu3TextModel,
|
||||
)
|
||||
@@ -310,7 +311,14 @@ class Emu3Vision2TextModelTester:
|
||||
|
||||
@require_torch
|
||||
class Emu3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (Emu3ForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
Emu3Model,
|
||||
Emu3ForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
pipeline_model_mapping = {}
|
||||
test_headmasking = False
|
||||
test_pruning = False
|
||||
@@ -395,6 +403,10 @@ class Emu3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, Pipeline
|
||||
def test_generate_with_static_cache(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Emu3 doesn't support Flex attn yet!")
|
||||
def test_flex_attention_with_grads(self):
|
||||
pass
|
||||
|
||||
|
||||
@require_torch
|
||||
class Emu3IntegrationTest(unittest.TestCase):
|
||||
|
||||
@@ -38,7 +38,7 @@ if is_torch_available() and is_vision_available():
|
||||
|
||||
|
||||
if is_torch_available():
|
||||
from transformers import FuyuForCausalLM
|
||||
from transformers import FuyuForCausalLM, FuyuModel
|
||||
|
||||
|
||||
class FuyuModelTester:
|
||||
@@ -145,7 +145,14 @@ class FuyuModelTester:
|
||||
|
||||
@require_torch
|
||||
class FuyuModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (FuyuForCausalLM,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
FuyuModel,
|
||||
FuyuForCausalLM,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
pipeline_model_mapping = (
|
||||
{"text-generation": FuyuForCausalLM, "image-text-to-text": FuyuForCausalLM} if is_torch_available() else {}
|
||||
)
|
||||
|
||||
@@ -50,6 +50,7 @@ if is_torch_available():
|
||||
from transformers import (
|
||||
Gemma3ForCausalLM,
|
||||
Gemma3ForConditionalGeneration,
|
||||
Gemma3Model,
|
||||
Gemma3Processor,
|
||||
Gemma3TextModel,
|
||||
)
|
||||
@@ -148,9 +149,9 @@ class Gemma3Vision2TextModelTester:
|
||||
self,
|
||||
parent,
|
||||
mm_tokens_per_image=2,
|
||||
image_token_index=1,
|
||||
boi_token_index=2,
|
||||
eoi_token_index=3,
|
||||
image_token_index=4,
|
||||
boi_token_index=5,
|
||||
eoi_token_index=6,
|
||||
seq_length=25,
|
||||
is_training=True,
|
||||
vision_config={
|
||||
@@ -242,7 +243,14 @@ class Gemma3Vision2TextModelTester:
|
||||
|
||||
@require_torch
|
||||
class Gemma3Vision2TextModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (Gemma3ForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
Gemma3Model,
|
||||
Gemma3ForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
all_generative_model_classes = (Gemma3ForConditionalGeneration,) if is_torch_available() else ()
|
||||
test_headmasking = False
|
||||
test_pruning = False
|
||||
|
||||
@@ -34,6 +34,7 @@ if is_torch_available():
|
||||
|
||||
from transformers import (
|
||||
GotOcr2ForConditionalGeneration,
|
||||
GotOcr2Model,
|
||||
)
|
||||
|
||||
|
||||
@@ -140,7 +141,14 @@ class GotOcr2VisionText2TextModelTester:
|
||||
|
||||
@require_torch
|
||||
class GotOcr2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (GotOcr2ForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
GotOcr2Model,
|
||||
GotOcr2ForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
pipeline_model_mapping = (
|
||||
{
|
||||
"image-to-text": GotOcr2ForConditionalGeneration,
|
||||
@@ -228,6 +236,10 @@ class GotOcr2ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
|
||||
def test_past_key_values_format(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(reason="Vision backbone doesn't support FLEX yet!")
|
||||
def test_flex_attention_with_grads(self):
|
||||
pass
|
||||
|
||||
|
||||
@require_torch
|
||||
class GotOcr2IntegrationTest(unittest.TestCase):
|
||||
|
||||
@@ -54,7 +54,7 @@ if is_torch_available():
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from transformers import InstructBlipForConditionalGeneration, InstructBlipVisionModel
|
||||
from transformers import InstructBlipForConditionalGeneration, InstructBlipModel, InstructBlipVisionModel
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
@@ -460,14 +460,20 @@ class InstructBlipForConditionalGenerationDecoderOnlyModelTester:
|
||||
"attention_mask": attention_mask,
|
||||
"qformer_input_ids": qformer_input_ids,
|
||||
"qformer_attention_mask": qformer_attention_mask,
|
||||
"labels": input_ids,
|
||||
}
|
||||
return config, inputs_dict
|
||||
|
||||
|
||||
@require_torch
|
||||
class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (InstructBlipForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
InstructBlipModel,
|
||||
InstructBlipForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
pipeline_model_mapping = {"image-text-to-text": InstructBlipForConditionalGeneration}
|
||||
fx_compatible = False
|
||||
test_head_masking = False
|
||||
|
||||
@@ -54,7 +54,11 @@ if is_torch_available():
|
||||
import torch
|
||||
from torch import nn
|
||||
|
||||
from transformers import InstructBlipVideoForConditionalGeneration, InstructBlipVideoVisionModel
|
||||
from transformers import (
|
||||
InstructBlipVideoForConditionalGeneration,
|
||||
InstructBlipVideoModel,
|
||||
InstructBlipVideoVisionModel,
|
||||
)
|
||||
|
||||
|
||||
class InstructBlipVideoVisionModelTester:
|
||||
@@ -477,7 +481,6 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester:
|
||||
"attention_mask": attention_mask,
|
||||
"qformer_input_ids": qformer_input_ids,
|
||||
"qformer_attention_mask": qformer_attention_mask,
|
||||
"labels": input_ids,
|
||||
}
|
||||
return config, inputs_dict
|
||||
|
||||
@@ -486,7 +489,9 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester:
|
||||
class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
|
||||
ModelTesterMixin, GenerationTesterMixin, unittest.TestCase
|
||||
):
|
||||
all_model_classes = (InstructBlipVideoForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(InstructBlipVideoForConditionalGeneration, InstructBlipVideoModel) if is_torch_available() else ()
|
||||
)
|
||||
fx_compatible = False
|
||||
test_head_masking = False
|
||||
test_pruning = False
|
||||
|
||||
@@ -47,9 +47,7 @@ from ...test_pipeline_mixin import PipelineTesterMixin
|
||||
if is_torch_available():
|
||||
import torch
|
||||
|
||||
from transformers import (
|
||||
InternVLForConditionalGeneration,
|
||||
)
|
||||
from transformers import InternVLForConditionalGeneration, InternVLModel
|
||||
|
||||
|
||||
if is_vision_available():
|
||||
@@ -191,7 +189,7 @@ class InternVLVisionText2TextModelTester:
|
||||
|
||||
@require_torch
|
||||
class InternVLModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (InternVLForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (InternVLForConditionalGeneration, InternVLModel) if is_torch_available() else ()
|
||||
all_generative_model_classes = (InternVLForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = (
|
||||
{
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch Llava model."""
|
||||
|
||||
import copy
|
||||
import unittest
|
||||
|
||||
import requests
|
||||
@@ -23,6 +24,7 @@ from transformers import (
|
||||
AutoTokenizer,
|
||||
LlavaConfig,
|
||||
LlavaForConditionalGeneration,
|
||||
LlavaModel,
|
||||
is_torch_available,
|
||||
is_vision_available,
|
||||
)
|
||||
@@ -166,7 +168,14 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
|
||||
Model tester for `LlavaForConditionalGeneration`.
|
||||
"""
|
||||
|
||||
all_model_classes = (LlavaForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
LlavaModel,
|
||||
LlavaForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
pipeline_model_mapping = (
|
||||
{"image-to-text": LlavaForConditionalGeneration, "image-text-to-text": LlavaForConditionalGeneration}
|
||||
if is_torch_available()
|
||||
@@ -238,16 +247,17 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
|
||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config).to(torch_device)
|
||||
_ = model(**input_dict) # successful forward with no modifications
|
||||
curr_input_dict = copy.deepcopy(input_dict) # copy first: the steps below modify the inputs in place
|
||||
_ = model(**curr_input_dict) # successful forward with no modifications
|
||||
|
||||
# remove one image but leave the image token in text
|
||||
input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...]
|
||||
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
|
||||
with self.assertRaises(ValueError):
|
||||
_ = model(**input_dict)
|
||||
_ = model(**curr_input_dict)
|
||||
|
||||
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
|
||||
input_ids = input_dict["input_ids"][:1]
|
||||
pixel_values = input_dict["pixel_values"][:1]
|
||||
input_ids = curr_input_dict["input_ids"][:1]
|
||||
pixel_values = curr_input_dict["pixel_values"][:1]
|
||||
input_ids = torch.cat([input_ids, input_ids], dim=0)
|
||||
|
||||
# one image and two image tokens raise an error
|
||||
@@ -281,7 +291,8 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
|
||||
model = model_class(config).to(torch_device)
|
||||
# We should have the right number of input features,
|
||||
# and should be able to run a forward pass without exploding
|
||||
assert model.multi_modal_projector.linear_1.in_features == expected_features
|
||||
base_model = getattr(model, "model", model)
|
||||
assert base_model.multi_modal_projector.linear_1.in_features == expected_features
|
||||
model(**input_dict)
|
||||
|
||||
@unittest.skip(
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch Llava-NeXT model."""
|
||||
|
||||
import copy
|
||||
import unittest
|
||||
|
||||
import requests
|
||||
@@ -23,6 +24,7 @@ from transformers import (
|
||||
AutoProcessor,
|
||||
LlavaNextConfig,
|
||||
LlavaNextForConditionalGeneration,
|
||||
LlavaNextModel,
|
||||
is_torch_available,
|
||||
is_vision_available,
|
||||
)
|
||||
@@ -181,7 +183,14 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
|
||||
Model tester for `LlavaNextForConditionalGeneration`.
|
||||
"""
|
||||
|
||||
all_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
LlavaNextModel,
|
||||
LlavaNextForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
pipeline_model_mapping = {"image-text-to-text": LlavaNextForConditionalGeneration} if is_torch_available() else {}
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
@@ -265,18 +274,19 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
|
||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config).to(torch_device)
|
||||
_ = model(**input_dict) # successful forward with no modifications
|
||||
curr_input_dict = copy.deepcopy(input_dict) # copy first: the steps below modify the inputs in place
|
||||
_ = model(**curr_input_dict) # successful forward with no modifications
|
||||
|
||||
# remove one image but leave the image token in text
|
||||
input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...]
|
||||
input_dict["image_sizes"] = input_dict["image_sizes"][-1:, ...]
|
||||
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
|
||||
curr_input_dict["image_sizes"] = curr_input_dict["image_sizes"][-1:, ...]
|
||||
with self.assertRaises(ValueError):
|
||||
_ = model(**input_dict)
|
||||
_ = model(**curr_input_dict)
|
||||
|
||||
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
|
||||
input_ids = input_dict["input_ids"][:1]
|
||||
pixel_values = input_dict["pixel_values"][:1]
|
||||
image_sizes = input_dict["image_sizes"][:1]
|
||||
input_ids = curr_input_dict["input_ids"][:1]
|
||||
pixel_values = curr_input_dict["pixel_values"][:1]
|
||||
image_sizes = curr_input_dict["image_sizes"][:1]
|
||||
input_ids = torch.cat([input_ids, input_ids], dim=0)
|
||||
|
||||
# one image and two image tokens raise an error
|
||||
@@ -324,7 +334,8 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
|
||||
model = model_class(config).to(torch_device)
|
||||
# We should have the right number of input features,
|
||||
# and should be able to run a forward pass without exploding
|
||||
assert model.multi_modal_projector.linear_1.in_features == expected_features
|
||||
base_model = getattr(model, "model", model)
|
||||
assert base_model.multi_modal_projector.linear_1.in_features == expected_features
|
||||
model(**input_dict)
|
||||
|
||||
@unittest.skip(
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch Llava-NeXT-Video model."""
|
||||
|
||||
import copy
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
@@ -23,6 +24,7 @@ from transformers import (
|
||||
AutoProcessor,
|
||||
LlavaNextVideoConfig,
|
||||
LlavaNextVideoForConditionalGeneration,
|
||||
LlavaNextVideoModel,
|
||||
is_torch_available,
|
||||
is_vision_available,
|
||||
)
|
||||
@@ -196,7 +198,14 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
|
||||
Model tester for `LlavaNextVideoForConditionalGeneration`.
|
||||
"""
|
||||
|
||||
all_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
LlavaNextVideoModel,
|
||||
LlavaNextVideoForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
_is_composite = True
|
||||
@@ -281,18 +290,19 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
|
||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config).to(torch_device)
|
||||
_ = model(**input_dict) # successful forward with no modifications
|
||||
curr_input_dict = copy.deepcopy(input_dict) # copy first: the steps below modify the inputs in place
|
||||
_ = model(**curr_input_dict) # successful forward with no modifications
|
||||
|
||||
# remove one image but leave the image token in text
|
||||
input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...]
|
||||
input_dict["image_sizes"] = input_dict["image_sizes"][-1:, ...]
|
||||
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
|
||||
curr_input_dict["image_sizes"] = curr_input_dict["image_sizes"][-1:, ...]
|
||||
with self.assertRaises(ValueError):
|
||||
_ = model(**input_dict)
|
||||
_ = model(**curr_input_dict)
|
||||
|
||||
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
|
||||
input_ids = input_dict["input_ids"][:1]
|
||||
pixel_values = input_dict["pixel_values"][:1]
|
||||
image_sizes = input_dict["image_sizes"][:1]
|
||||
input_ids = curr_input_dict["input_ids"][:1]
|
||||
pixel_values = curr_input_dict["pixel_values"][:1]
|
||||
image_sizes = curr_input_dict["image_sizes"][:1]
|
||||
input_ids = torch.cat([input_ids, input_ids], dim=0)
|
||||
|
||||
# one image and two image tokens raise an error
|
||||
@@ -340,7 +350,8 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
|
||||
model = model_class(config).to(torch_device)
|
||||
# We should have the right number of input features,
|
||||
# and should be able to run a forward pass without exploding
|
||||
assert model.multi_modal_projector.linear_1.in_features == expected_features
|
||||
base_model = getattr(model, "model", model)
|
||||
assert base_model.multi_modal_projector.linear_1.in_features == expected_features
|
||||
model(**input_dict)
|
||||
|
||||
@unittest.skip(
|
||||
|
||||
@@ -24,6 +24,7 @@ from transformers import (
|
||||
AutoProcessor,
|
||||
LlavaOnevisionConfig,
|
||||
LlavaOnevisionForConditionalGeneration,
|
||||
LlavaOnevisionModel,
|
||||
is_torch_available,
|
||||
is_vision_available,
|
||||
)
|
||||
@@ -182,7 +183,14 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati
|
||||
Model tester for `LlavaOnevisionForConditionalGeneration`.
|
||||
"""
|
||||
|
||||
all_model_classes = (LlavaOnevisionForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
LlavaOnevisionModel,
|
||||
LlavaOnevisionForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
pipeline_model_mapping = (
|
||||
{"image-text-to-text": LlavaOnevisionForConditionalGeneration} if is_torch_available() else {}
|
||||
)
|
||||
@@ -296,7 +304,8 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati
|
||||
model = model_class(config).to(torch_device)
|
||||
# We should have the right number of input features,
|
||||
# and should be able to run a forward pass without exploding
|
||||
assert model.multi_modal_projector.linear_1.in_features == expected_features
|
||||
base_model = getattr(model, "model", model)
|
||||
assert base_model.multi_modal_projector.linear_1.in_features == expected_features
|
||||
model(**input_dict)
|
||||
|
||||
@unittest.skip(
|
||||
|
||||
@@ -42,6 +42,7 @@ if is_torch_available():
|
||||
|
||||
from transformers import (
|
||||
Mistral3ForConditionalGeneration,
|
||||
Mistral3Model,
|
||||
)
|
||||
|
||||
|
||||
@@ -162,7 +163,14 @@ class Mistral3VisionText2TextModelTester:
|
||||
|
||||
@require_torch
|
||||
class Mistral3ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin, unittest.TestCase):
|
||||
all_model_classes = (Mistral3ForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
Mistral3Model,
|
||||
Mistral3ForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
all_generative_model_classes = (Mistral3ForConditionalGeneration,) if is_torch_available() else ()
|
||||
pipeline_model_mapping = (
|
||||
{
|
||||
@@ -278,6 +286,10 @@ class Mistral3ModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterM
|
||||
def test_sdpa_can_dispatch_on_flash(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("Pixtral does not support attention interfaces.")
|
||||
def test_flex_attention_with_grads(self):
|
||||
pass
|
||||
|
||||
|
||||
@slow
|
||||
@require_torch_gpu
|
||||
|
||||
@@ -25,6 +25,7 @@ from transformers import (
|
||||
MllamaConfig,
|
||||
MllamaForCausalLM,
|
||||
MllamaForConditionalGeneration,
|
||||
MllamaModel,
|
||||
is_torch_available,
|
||||
is_vision_available,
|
||||
)
|
||||
@@ -262,7 +263,14 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
|
||||
Model tester for `MllamaForConditionalGeneration`.
|
||||
"""
|
||||
|
||||
all_model_classes = (MllamaForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
MllamaModel,
|
||||
MllamaForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
pipeline_model_mapping = {"image-text-to-text": MllamaForConditionalGeneration} if is_torch_available() else ()
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
@@ -325,19 +333,18 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
|
||||
# resizing embeddings should result in successful loss computation
|
||||
config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model_vocab_size = config.get_text_config().vocab_size
|
||||
inputs = self._prepare_for_class(inputs, model_class, return_labels=True)
|
||||
# Resize embeddings and call forward
|
||||
model.resize_token_embeddings(model_vocab_size + 10)
|
||||
output = model(
|
||||
input_ids=inputs["input_ids"],
|
||||
attention_mask=inputs["attention_mask"],
|
||||
labels=inputs["labels"],
|
||||
return_dict=True,
|
||||
)
|
||||
self.assertTrue("loss" in output)
|
||||
model = MllamaForConditionalGeneration(config).to(torch_device)
|
||||
model_vocab_size = config.get_text_config().vocab_size
|
||||
inputs = self._prepare_for_class(inputs, MllamaForConditionalGeneration, return_labels=True)
|
||||
# Resize embeddings and call forward
|
||||
model.resize_token_embeddings(model_vocab_size + 10)
|
||||
output = model(
|
||||
input_ids=inputs["input_ids"],
|
||||
attention_mask=inputs["attention_mask"],
|
||||
labels=inputs["labels"],
|
||||
return_dict=True,
|
||||
)
|
||||
self.assertTrue("loss" in output)
|
||||
|
||||
def _check_attentions_for_generate(
|
||||
self, batch_size, attentions, prompt_length, output_length, config, decoder_past_key_values
|
||||
@@ -409,6 +416,18 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
|
||||
def test_assisted_decoding_with_num_logits_to_keep(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(reason="Mllama uses self.weights dirrectly causing device mismatch when offloading`")
|
||||
def test_cpu_offload(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(reason="Mllama uses self.weights dirrectly causing device mismatch when offloading`")
|
||||
def test_disk_offload_bin(self):
|
||||
pass
|
||||
|
||||
@unittest.skip(reason="Mllama uses self.weights dirrectly causing device mismatch when offloading`")
|
||||
def test_disk_offload_safetensors(self):
|
||||
pass
|
||||
|
||||
@pytest.mark.generate
|
||||
# overridden because mllama is not an encoder-decoder model, but has encoder-decoder-like cache
|
||||
def test_past_key_values_format(self):
|
||||
@@ -501,7 +520,7 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
|
||||
"""
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
for model_class in self.all_generative_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch PaliGemma model."""
|
||||
|
||||
import copy
|
||||
import unittest
|
||||
|
||||
import requests
|
||||
@@ -20,6 +21,7 @@ import requests
|
||||
from transformers import (
|
||||
PaliGemmaConfig,
|
||||
PaliGemmaForConditionalGeneration,
|
||||
PaliGemmaModel,
|
||||
PaliGemmaProcessor,
|
||||
is_torch_available,
|
||||
is_vision_available,
|
||||
@@ -177,7 +179,14 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
|
||||
Model tester for `PaliGemmaForConditionalGeneration`.
|
||||
"""
|
||||
|
||||
all_model_classes = (PaliGemmaForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
PaliGemmaModel,
|
||||
PaliGemmaForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
pipeline_model_mapping = {"image-text-to-text": PaliGemmaForConditionalGeneration}
|
||||
fx_compatible = False
|
||||
test_pruning = False
|
||||
@@ -242,16 +251,17 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
|
||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config).to(torch_device)
|
||||
_ = model(**input_dict) # successful forward with no modifications
|
||||
curr_input_dict = copy.deepcopy(input_dict) # copy first: the steps below modify the inputs in place
|
||||
_ = model(**curr_input_dict) # successful forward with no modifications
|
||||
|
||||
# remove one image but leave the image token in text
|
||||
input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...]
|
||||
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
|
||||
with self.assertRaises(ValueError):
|
||||
_ = model(**input_dict)
|
||||
_ = model(**curr_input_dict)
|
||||
|
||||
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
|
||||
input_ids = input_dict["input_ids"][:1]
|
||||
pixel_values = input_dict["pixel_values"][:1]
|
||||
input_ids = curr_input_dict["input_ids"][:1]
|
||||
pixel_values = curr_input_dict["pixel_values"][:1]
|
||||
input_ids = torch.cat([input_ids, input_ids], dim=0)
|
||||
|
||||
# one image and two image tokens raise an error
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch PaliGemma2 model."""
|
||||
|
||||
import copy
|
||||
import unittest
|
||||
|
||||
import pytest
|
||||
@@ -239,16 +240,17 @@ class PaliGemma2ForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
|
||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config).to(torch_device)
|
||||
_ = model(**input_dict) # successful forward with no modifications
|
||||
curr_input_dict = copy.deepcopy(input_dict) # copy first: the steps below modify the inputs in place
|
||||
_ = model(**curr_input_dict) # successful forward with no modifications
|
||||
|
||||
# remove one image but leave the image token in text
|
||||
input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...]
|
||||
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
|
||||
with self.assertRaises(ValueError):
|
||||
_ = model(**input_dict)
|
||||
_ = model(**curr_input_dict)
|
||||
|
||||
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
|
||||
input_ids = input_dict["input_ids"][:1]
|
||||
pixel_values = input_dict["pixel_values"][:1]
|
||||
input_ids = curr_input_dict["input_ids"][:1]
|
||||
pixel_values = curr_input_dict["pixel_values"][:1]
|
||||
input_ids = torch.cat([input_ids, input_ids], dim=0)
|
||||
|
||||
# one image and two image tokens raise an error
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch Qwen2.5-VL model."""
|
||||
|
||||
import copy
|
||||
import gc
|
||||
import tempfile
|
||||
import unittest
|
||||
@@ -23,6 +24,7 @@ from transformers import (
|
||||
AutoProcessor,
|
||||
Qwen2_5_VLConfig,
|
||||
Qwen2_5_VLForConditionalGeneration,
|
||||
Qwen2_5_VLModel,
|
||||
is_torch_available,
|
||||
is_vision_available,
|
||||
)
|
||||
@@ -180,17 +182,11 @@ class Qwen2_5_VLVisionText2TextModelTester:
|
||||
input_ids[input_ids == self.vision_start_token_id] = self.pad_token_id
|
||||
input_ids[:, self.num_image_tokens] = self.image_token_id
|
||||
input_ids[:, self.num_image_tokens - 1] = self.vision_start_token_id
|
||||
labels = torch.zeros(
|
||||
(self.batch_size, self.seq_length),
|
||||
dtype=torch.long,
|
||||
device=torch_device,
|
||||
)
|
||||
inputs_dict = {
|
||||
"pixel_values": pixel_values,
|
||||
"image_grid_thw": torch.tensor([[1, 1, 1]] * self.batch_size),
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": attention_mask,
|
||||
"labels": labels,
|
||||
}
|
||||
return config, inputs_dict
|
||||
|
||||
@@ -201,7 +197,14 @@ class Qwen2_5_VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
|
||||
Model tester for `Qwen2_5_VLForConditionalGeneration`.
|
||||
"""
|
||||
|
||||
all_model_classes = (Qwen2_5_VLForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
Qwen2_5_VLModel,
|
||||
Qwen2_5_VLForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
|
||||
@@ -236,19 +239,20 @@ class Qwen2_5_VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config).to(torch_device)
|
||||
_ = model(**input_dict) # successful forward with no modifications
|
||||
curr_input_dict = copy.deepcopy(input_dict)
|
||||
|
||||
# remove one image but leave the image token in text
|
||||
patch_size = config.vision_config.patch_size
|
||||
one_img_length = (self.model_tester.image_size**2) // (patch_size**2)
|
||||
input_dict["pixel_values"] = input_dict["pixel_values"][-one_img_length:, ...]
|
||||
input_dict["image_grid_thw"] = input_dict["image_grid_thw"][-1:, ...]
|
||||
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-one_img_length:, ...]
|
||||
curr_input_dict["image_grid_thw"] = curr_input_dict["image_grid_thw"][-1:, ...]
|
||||
with self.assertRaises(ValueError):
|
||||
_ = model(**input_dict)
|
||||
_ = model(**curr_input_dict)
|
||||
|
||||
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
|
||||
input_ids = input_dict["input_ids"][:1]
|
||||
pixel_values = input_dict["pixel_values"][:one_img_length]
|
||||
image_grid_thw = input_dict["image_grid_thw"][:1]
|
||||
input_ids = curr_input_dict["input_ids"][:1]
|
||||
pixel_values = curr_input_dict["pixel_values"][:one_img_length]
|
||||
image_grid_thw = curr_input_dict["image_grid_thw"][:1]
|
||||
input_ids = torch.cat([input_ids, input_ids], dim=0)
|
||||
|
||||
# one image and two image tokens raise an error
|
||||
@@ -375,6 +379,29 @@ class Qwen2_5_VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
|
||||
def test_save_load_fast_init_from_base(self):
|
||||
pass
|
||||
|
||||
# The multimodal base model embeds will not match ids, due to pixel values. We can't change base test
|
||||
# because in some models `pixel_values` are required. Will be fixed when we add support for merging `embeds+pixels`
|
||||
# TODO: @raushan
|
||||
def test_inputs_embeds_matches_input_ids(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
inputs = self._prepare_for_class(inputs_dict, model_class)
|
||||
input_ids = inputs["input_ids"]
|
||||
del inputs["input_ids"]
|
||||
del inputs["pixel_values"]
|
||||
|
||||
inputs_embeds = model.get_input_embeddings()(input_ids)
|
||||
|
||||
with torch.no_grad():
|
||||
out_ids = model(input_ids=input_ids, **inputs)[0]
|
||||
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
|
||||
torch.testing.assert_close(out_embeds, out_ids)
|
||||
|
||||
|
||||
@require_torch
|
||||
class Qwen2_5_VLIntegrationTest(unittest.TestCase):
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch Qwen2-VL model."""
|
||||
|
||||
import copy
|
||||
import gc
|
||||
import unittest
|
||||
|
||||
@@ -22,6 +23,7 @@ from transformers import (
|
||||
AutoProcessor,
|
||||
Qwen2VLConfig,
|
||||
Qwen2VLForConditionalGeneration,
|
||||
Qwen2VLModel,
|
||||
is_torch_available,
|
||||
is_vision_available,
|
||||
)
|
||||
@@ -169,17 +171,12 @@ class Qwen2VLVisionText2TextModelTester:
|
||||
input_ids[input_ids == self.vision_start_token_id] = self.pad_token_id
|
||||
input_ids[:, self.num_image_tokens] = self.image_token_id
|
||||
input_ids[:, self.num_image_tokens - 1] = self.vision_start_token_id
|
||||
labels = torch.zeros(
|
||||
(self.batch_size, self.seq_length),
|
||||
dtype=torch.long,
|
||||
device=torch_device,
|
||||
)
|
||||
|
||||
inputs_dict = {
|
||||
"pixel_values": pixel_values,
|
||||
"image_grid_thw": torch.tensor([[1, 1, 1]] * self.batch_size),
|
||||
"input_ids": input_ids,
|
||||
"attention_mask": attention_mask,
|
||||
"labels": labels,
|
||||
}
|
||||
return config, inputs_dict
|
||||
|
||||
@@ -190,7 +187,14 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
|
||||
Model tester for `Qwen2VLForConditionalGeneration`.
|
||||
"""
|
||||
|
||||
all_model_classes = (Qwen2VLForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
Qwen2VLModel,
|
||||
Qwen2VLForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
pipeline_model_mapping = {"image-text-to-text": Qwen2VLForConditionalGeneration}
|
||||
test_pruning = False
|
||||
test_head_masking = False
|
||||
@@ -226,20 +230,21 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
|
||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config).to(torch_device)
|
||||
_ = model(**input_dict) # successful forward with no modifications
|
||||
curr_input_dict = copy.deepcopy(input_dict)
|
||||
_ = model(**curr_input_dict) # successful forward with no modifications
|
||||
|
||||
# remove one image but leave the image token in text
|
||||
patch_size = config.vision_config.patch_size
|
||||
one_img_length = (self.model_tester.image_size**2) // (patch_size**2)
|
||||
input_dict["pixel_values"] = input_dict["pixel_values"][-one_img_length:, ...]
|
||||
input_dict["image_grid_thw"] = input_dict["image_grid_thw"][-1:, ...]
|
||||
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-one_img_length:, ...]
|
||||
curr_input_dict["image_grid_thw"] = curr_input_dict["image_grid_thw"][-1:, ...]
|
||||
with self.assertRaises(ValueError):
|
||||
_ = model(**input_dict)
|
||||
_ = model(**curr_input_dict)
|
||||
|
||||
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
|
||||
input_ids = input_dict["input_ids"][:1]
|
||||
pixel_values = input_dict["pixel_values"][:one_img_length]
|
||||
image_grid_thw = input_dict["image_grid_thw"][:1]
|
||||
input_ids = curr_input_dict["input_ids"][:1]
|
||||
pixel_values = curr_input_dict["pixel_values"][:one_img_length]
|
||||
image_grid_thw = curr_input_dict["image_grid_thw"][:1]
|
||||
input_ids = torch.cat([input_ids, input_ids], dim=0)
|
||||
|
||||
# one image and two image tokens raise an error
|
||||
@@ -262,11 +267,11 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
|
||||
model = model_class(config).to(torch_device)
|
||||
|
||||
# Generate and make sure rope_deltas are not `None`
|
||||
self.assertTrue(model.rope_deltas is None)
|
||||
self.assertTrue(model.model.rope_deltas is None)
|
||||
generation_output = model.generate(
|
||||
**input_dict, max_new_tokens=4, return_dict_in_generate=True, output_logits=True
|
||||
)
|
||||
self.assertTrue(model.rope_deltas is not None)
|
||||
self.assertTrue(model.model.rope_deltas is not None)
|
||||
|
||||
# Now if we try to do forward pass, we should get new rope logits, because cache is not passed
|
||||
forward_output = model(**input_dict)
|
||||
@@ -320,6 +325,29 @@ class Qwen2VLModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.TestCas
|
||||
def test_save_load_fast_init_from_base(self):
|
||||
pass
|
||||
|
||||
# The multimodal base model embeds will not match ids, due to pixel values. We can't change base test
|
||||
# because in some models `pixel_values` are required. Will be fixed when we add support for merging `embeds+pixels`
|
||||
# TODO: @raushan
|
||||
def test_inputs_embeds_matches_input_ids(self):
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config)
|
||||
model.to(torch_device)
|
||||
model.eval()
|
||||
|
||||
inputs = self._prepare_for_class(inputs_dict, model_class)
|
||||
input_ids = inputs["input_ids"]
|
||||
del inputs["input_ids"]
|
||||
del inputs["pixel_values"]
|
||||
|
||||
inputs_embeds = model.get_input_embeddings()(input_ids)
|
||||
|
||||
with torch.no_grad():
|
||||
out_ids = model(input_ids=input_ids, **inputs)[0]
|
||||
out_embeds = model(inputs_embeds=inputs_embeds, **inputs)[0]
|
||||
torch.testing.assert_close(out_embeds, out_ids)
|
||||
|
||||
|
||||
@require_torch
|
||||
class Qwen2VLIntegrationTest(unittest.TestCase):
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch VideoLlava model."""
|
||||
|
||||
import copy
|
||||
import unittest
|
||||
|
||||
import numpy as np
|
||||
@@ -23,6 +24,7 @@ from parameterized import parameterized
|
||||
from transformers import (
|
||||
VideoLlavaConfig,
|
||||
VideoLlavaForConditionalGeneration,
|
||||
VideoLlavaModel,
|
||||
VideoLlavaProcessor,
|
||||
is_torch_available,
|
||||
is_vision_available,
|
||||
@@ -190,7 +192,14 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
|
||||
Model tester for `VideoLlavaForConditionalGeneration`.
|
||||
"""
|
||||
|
||||
all_model_classes = (VideoLlavaForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
VideoLlavaModel,
|
||||
VideoLlavaForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
fx_compatible = False
|
||||
test_pruning = False
|
||||
test_resize_embeddings = True
|
||||
@@ -235,46 +244,49 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
|
||||
def test_mixed_input(self):
|
||||
config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
for model_class in self.all_model_classes:
|
||||
curr_inputs = copy.deepcopy(inputs)
|
||||
model = model_class(config).to(torch_device).eval()
|
||||
# test that the forward does not fail
|
||||
with torch.no_grad():
|
||||
_ = model(**inputs)
|
||||
_ = model(**curr_inputs)
|
||||
|
||||
# if we remove some images from inputs leaving only one
|
||||
# image number mismatch error should raise
|
||||
inputs["pixel_values_images"] = inputs["pixel_values_images"][:1]
|
||||
curr_inputs["pixel_values_images"] = curr_inputs["pixel_values_images"][:1]
|
||||
with self.assertRaises(ValueError):
|
||||
_ = model(**inputs)
|
||||
_ = model(**curr_inputs)
|
||||
|
||||
def test_video_only_input(self):
|
||||
config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
for model_class in self.all_model_classes:
|
||||
curr_inputs = copy.deepcopy(inputs)
|
||||
model = model_class(config).to(torch_device).eval()
|
||||
# replace image token id with dummy id
|
||||
# Error will be raised as num-image-tokens and num-of-image-embeds mismatch
|
||||
inputs["input_ids"][:, : self.model_tester.num_image_tokens] = 2
|
||||
curr_inputs["input_ids"][:, : self.model_tester.num_image_tokens] = 2
|
||||
with self.assertRaises(ValueError):
|
||||
_ = model(**inputs)
|
||||
_ = model(**curr_inputs)
|
||||
|
||||
inputs["pixel_values_images"] = None
|
||||
_ = model(**inputs)
|
||||
curr_inputs["pixel_values_images"] = None
|
||||
_ = model(**curr_inputs)
|
||||
|
||||
def test_image_only_input(self):
|
||||
config, inputs = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
for model_class in self.all_model_classes:
|
||||
curr_inputs = copy.deepcopy(inputs)
|
||||
model = model_class(config).to(torch_device).eval()
|
||||
# set dummy id, which is not video token id
|
||||
# Error will be raised as num-video-tokens and num-of-video-embeds mismatch
|
||||
inputs["input_ids"][
|
||||
curr_inputs["input_ids"][
|
||||
:,
|
||||
self.model_tester.num_image_tokens : self.model_tester.num_image_tokens
|
||||
+ self.model_tester.num_video_tokens,
|
||||
] = 2
|
||||
with self.assertRaises(ValueError):
|
||||
_ = model(**inputs)
|
||||
_ = model(**curr_inputs)
|
||||
|
||||
inputs["pixel_values_videos"] = None
|
||||
_ = model(**inputs)
|
||||
curr_inputs["pixel_values_videos"] = None
|
||||
_ = model(**curr_inputs)
|
||||
|
||||
def test_batching_equivalence(self):
|
||||
def recursive_check(batched_object, single_row_object, model_name, key):
|
||||
@@ -386,16 +398,17 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
|
||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config).to(torch_device)
|
||||
_ = model(**input_dict) # successful forward with no modifications
|
||||
curr_input_dict = copy.deepcopy(input_dict)
|
||||
_ = model(**curr_input_dict) # successful forward with no modifications
|
||||
|
||||
# remove one image but leave the image token in text
|
||||
input_dict["pixel_values_images"] = input_dict["pixel_values_images"][-1:, ...]
|
||||
curr_input_dict["pixel_values_images"] = curr_input_dict["pixel_values_images"][-1:, ...]
|
||||
with self.assertRaises(ValueError):
|
||||
_ = model(**input_dict)
|
||||
_ = model(**curr_input_dict)
|
||||
|
||||
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
|
||||
input_ids = input_dict["input_ids"][:1]
|
||||
pixel_values = input_dict["pixel_values_images"][:1]
|
||||
input_ids = curr_input_dict["input_ids"][:1]
|
||||
pixel_values = curr_input_dict["pixel_values_images"][:1]
|
||||
input_ids = torch.cat([input_ids, input_ids], dim=0)
|
||||
|
||||
# one image and two image tokens raise an error
|
||||
@@ -429,7 +442,8 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
|
||||
model = model_class(config).to(torch_device)
|
||||
# We should have the right number of input features,
|
||||
# and should be able to run a forward pass without exploding
|
||||
assert model.multi_modal_projector.linear_1.in_features == expected_features
|
||||
base_model = getattr(model, "model", model)
|
||||
assert base_model.multi_modal_projector.linear_1.in_features == expected_features
|
||||
model(**input_dict)
|
||||
|
||||
|
||||
|
||||
@@ -13,6 +13,7 @@
|
||||
# limitations under the License.
|
||||
"""Testing suite for the PyTorch VipLlava model."""
|
||||
|
||||
import copy
|
||||
import unittest
|
||||
|
||||
import requests
|
||||
@@ -22,6 +23,7 @@ from transformers import (
|
||||
AutoProcessor,
|
||||
VipLlavaConfig,
|
||||
VipLlavaForConditionalGeneration,
|
||||
VipLlavaModel,
|
||||
is_torch_available,
|
||||
is_vision_available,
|
||||
)
|
||||
@@ -165,7 +167,14 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
|
||||
Model tester for `VipLlavaForConditionalGeneration`.
|
||||
"""
|
||||
|
||||
all_model_classes = (VipLlavaForConditionalGeneration,) if is_torch_available() else ()
|
||||
all_model_classes = (
|
||||
(
|
||||
VipLlavaModel,
|
||||
VipLlavaForConditionalGeneration,
|
||||
)
|
||||
if is_torch_available()
|
||||
else ()
|
||||
)
|
||||
pipeline_model_mapping = {"image-text-to-text": VipLlavaForConditionalGeneration} if is_torch_available() else {}
|
||||
fx_compatible = False
|
||||
test_pruning = False
|
||||
@@ -236,16 +245,17 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
|
||||
config, input_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
for model_class in self.all_model_classes:
|
||||
model = model_class(config).to(torch_device)
|
||||
_ = model(**input_dict) # successful forward with no modifications
|
||||
curr_input_dict = copy.deepcopy(input_dict) # copy, since the inputs are modified in place below
|
||||
_ = model(**curr_input_dict) # successful forward with no modifications
|
||||
|
||||
# remove one image but leave the image token in text
|
||||
input_dict["pixel_values"] = input_dict["pixel_values"][-1:, ...]
|
||||
curr_input_dict["pixel_values"] = curr_input_dict["pixel_values"][-1:, ...]
|
||||
with self.assertRaises(ValueError):
|
||||
_ = model(**input_dict)
|
||||
_ = model(**curr_input_dict)
|
||||
|
||||
# simulate multi-image case by concatenating inputs where each has exactly one image/image-token
|
||||
input_ids = input_dict["input_ids"][:1]
|
||||
pixel_values = input_dict["pixel_values"][:1]
|
||||
input_ids = curr_input_dict["input_ids"][:1]
|
||||
pixel_values = curr_input_dict["pixel_values"][:1]
|
||||
input_ids = torch.cat([input_ids, input_ids], dim=0)
|
||||
|
||||
# one image and two image tokens raise an error
|
||||
@@ -284,7 +294,8 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
|
||||
model = model_class(config).to(torch_device)
|
||||
# We should have the right number of input features,
|
||||
# and should be able to run a forward pass without exploding
|
||||
assert model.multi_modal_projector.linear_1.in_features == expected_features
|
||||
base_model = getattr(model, "model", model)
|
||||
assert base_model.multi_modal_projector.linear_1.in_features == expected_features
|
||||
model(**input_dict)
|
||||
|
||||
@unittest.skip(
|
||||
@@ -311,6 +322,10 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
|
||||
def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
|
||||
pass
|
||||
|
||||
@unittest.skip("LLaVA vision backbones doesn't support flex attention yet")
|
||||
def test_flex_attention_with_grads(self):
|
||||
pass
|
||||
|
||||
|
||||
@require_torch
|
||||
class VipLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):
|
||||
|
||||
Reference in New Issue
Block a user