Attn implementation for composite models (#32238)

* first try

* codestyle

* idefics2 is happy

* [run-slow] llava, llava_next, video_llava, vipllava, llava_next_video, idefics, idefics2, kosmos2, fuyu, blip, blip_2, instructblip, instructblipvideo, paligemma

* fix-copies

* [run-slow] llava, llava_next, video_llava, vipllava, llava_next_video, idefics, idefics2, kosmos2, fuyu, blip, blip_2, instructblip, instructblipvideo

* blip-2 needs to init vision from config

* when was this removed O_o

* minor fix

* tests

* this way?

* tests

* model-agnostic code

* codestyle

* add tests for idefics

* modify general test for VLMs

* no generation test for vlm yet!

* no generation test here also

* wanr in VIT-SDPA if output attn

* add more tests

* user can pass dict as attn impl

* repo consistency

* update

* muicgen

* no prints

* forgot speech enc-dec and clip

* how many composite models we have?

* musicgen meelody is same as mudicgen

* +siglip

* fix tests + add some more

* remove idefics custom overriden code

* make idefics2 automappable

* nits

* skip tests

* doctests

* Update src/transformers/models/idefics2/configuration_idefics2.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/clip/test_modeling_clip.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/idefics2/test_modeling_idefics2.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update tests/models/idefics2/test_modeling_idefics2.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* Update src/transformers/configuration_utils.py

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>

* major update, no need for automap

* clean up

* add FA2 test

* more tests

* style

* skip tests

* why did these started failing now?

* no attributes for FA2 needed

* one tiny test

* address comment about FA2 false warning

* style

* add new models and resolve conflicts

* fix copies

* let it be this way for now, come back tomorrow to review

* some more fixes

* update

* more updates

* update

* fix copies

* style and tests

* another big update

* fix tests

* fix tests

* update

* another update

* fix tests

* fix copies

* fix tests

---------

Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
Raushan Turganbay
2024-10-22 06:54:44 +02:00
committed by GitHub
parent 32590b5ecb
commit 21d5025826
64 changed files with 1925 additions and 713 deletions

View File

@@ -27,6 +27,7 @@ from transformers.testing_utils import (
require_torch_fp16,
require_torch_gpu,
require_torch_multi_accelerator,
require_torch_sdpa,
require_vision,
slow,
torch_device,
@@ -456,6 +457,7 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT
test_resize_embeddings = False
test_attention_outputs = False
test_torchscript = False
_is_composite = True
def setUp(self):
self.model_tester = Blip2ForConditionalGenerationDecoderOnlyModelTester(self)
@@ -488,6 +490,66 @@ class Blip2ForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, GenerationT
def test_save_load_fast_init_to_base(self):
pass
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
"""
Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model.
This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention".
In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model
is loaded, because we manually replicate requested attn implementation on each sub-config when loading.
See https://github.com/huggingface/transformers/pull/32238 for more info
The test tries to cover most general cases of composite models, VLMs with vision and text configs. Any model
that has a different set of sub-configs has to overwrite this test.
"""
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
if not self._is_composite:
self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_sdpa = model_class.from_pretrained(tmpdirname)
model_sdpa = model_sdpa.eval().to(torch_device)
text_attn = "sdpa" if model.language_model._supports_sdpa else "eager"
vision_attn = "sdpa" if model.vision_model._supports_sdpa else "eager"
qformer_attn = "sdpa" if model.qformer._supports_sdpa else "eager"
# `None` as it is the requested one which will be assigned to each sub-config
# Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
self.assertTrue(model.language_model.config._attn_implementation == text_attn)
self.assertTrue(model.vision_model.config._attn_implementation == vision_attn)
self.assertTrue(model.qformer.config._attn_implementation == qformer_attn)
model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
self.assertTrue(model_eager.qformer.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa and any(
module_attn == "sdpa" for module_attn in [text_attn, vision_attn, qformer_attn]
):
raise ValueError("The SDPA model should have SDPA attention layers")
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
@@ -715,6 +777,7 @@ class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixi
test_resize_embeddings = False
test_attention_outputs = False
test_torchscript = False
_is_composite = True
# TODO: Fix the failed tests
def is_pipeline_test_to_skip(
@@ -768,6 +831,66 @@ class Blip2ModelTest(ModelTesterMixin, PipelineTesterMixin, GenerationTesterMixi
def test_cpu_offload(self):
pass
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
"""
Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model.
This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention".
In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model
is loaded, because we manually replicate requested attn implementation on each sub-config when loading.
See https://github.com/huggingface/transformers/pull/32238 for more info
The test tries to cover most general cases of composite models, VLMs with vision and text configs. Any model
that has a different set of sub-configs has to overwrite this test.
"""
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
if not self._is_composite:
self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_sdpa = model_class.from_pretrained(tmpdirname)
model_sdpa = model_sdpa.eval().to(torch_device)
text_attn = "sdpa" if model.language_model._supports_sdpa else "eager"
vision_attn = "sdpa" if model.vision_model._supports_sdpa else "eager"
qformer_attn = "sdpa" if model.qformer._supports_sdpa else "eager"
# `None` as it is the requested one which will be assigned to each sub-config
# Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
self.assertTrue(model.language_model.config._attn_implementation == text_attn)
self.assertTrue(model.vision_model.config._attn_implementation == vision_attn)
self.assertTrue(model.qformer.config._attn_implementation == qformer_attn)
model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
self.assertTrue(model_eager.qformer.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa and any(
module_attn == "sdpa" for module_attn in [text_attn, vision_attn, qformer_attn]
):
raise ValueError("The SDPA model should have SDPA attention layers")
def test_forward_signature(self):
config, _ = self.model_tester.prepare_config_and_inputs_for_common()

View File

@@ -191,6 +191,53 @@ class CLIPModelTesterMixin(ModelTesterMixin):
different output logits, and are not supposed to be used or tested with padding_side="left".
"""
def test_sdpa_can_dispatch_composite_models(self):
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
# Load the model with SDPA
model_sdpa = model_class.from_pretrained(tmpdirname)
model_sdpa = model_sdpa.eval().to(torch_device)
# Load model with eager attention
model_eager = model_class.from_pretrained(
tmpdirname,
attn_implementation="eager",
)
model_eager = model_eager.eval().to(torch_device)
# SigLip has one shared cls attr for all models, so we assign both submodels heer
vision_attn = text_attn = "sdpa" if model._supports_sdpa else "eager"
# `None` as it is the requested one which will be assigned to each sub-config
# Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
if hasattr(model_sdpa, "vision_model") and hasattr(model_sdpa, "text_model"):
self.assertTrue(model_sdpa.vision_model.config._attn_implementation == vision_attn)
self.assertTrue(model_sdpa.text_model.config._attn_implementation == text_attn)
self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
self.assertTrue(model_eager.text_model.config._attn_implementation == "eager")
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa and model_sdpa.config.model_type != "falcon":
raise ValueError("The SDPA model should have SDPA attention layers")
def test_eager_matches_sdpa_inference(
self,
torch_dtype: str,
@@ -252,24 +299,6 @@ class CLIPModelTesterMixin(ModelTesterMixin):
)
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving the model each time,
# but it would be nicer to have an efficient way to use parameterized.expand
cases = [
@@ -461,6 +490,10 @@ class CLIPVisionModelTest(CLIPModelTesterMixin, unittest.TestCase):
use_attention_mask_options=(None,),
)
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
super().test_sdpa_can_dispatch_composite_models()
class CLIPTextModelTester:
def __init__(
@@ -639,6 +672,10 @@ class CLIPTextModelTest(CLIPModelTesterMixin, unittest.TestCase):
use_attention_mask_options=(None, "right"), # "left" is not supported for text model
)
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
super().test_sdpa_can_dispatch_composite_models()
@require_torch_sdpa
def test_sdpa_can_dispatch_on_flash(self):
self.skipTest(reason="CLIPTextModel has two attention masks: `causal_attention_mask` and `attention_mask`")
@@ -704,6 +741,7 @@ class CLIPModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase
test_pruning = False
test_resize_embeddings = False
test_attention_outputs = False
_is_composite = True
def setUp(self):
self.model_tester = CLIPModelTester(self)
@@ -975,6 +1013,10 @@ class CLIPModelTest(CLIPModelTesterMixin, PipelineTesterMixin, unittest.TestCase
use_attention_mask_options=(None, "right"), # "left" is not supported for text model
)
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
super().test_sdpa_can_dispatch_composite_models()
@require_torch_sdpa
def test_sdpa_can_dispatch_on_flash(self):
self.skipTest(reason="CLIP text tower has two attention masks: `causal_attention_mask` and `attention_mask`")
@@ -1104,6 +1146,7 @@ class CLIPForImageClassificationModelTest(CLIPModelTesterMixin, PipelineTesterMi
test_pruning = False
test_resize_embeddings = False
test_attention_outputs = False
_is_composite = True
def setUp(self):
self.model_tester = CLIPForImageClassificationModelTester(self)
@@ -1143,6 +1186,10 @@ class CLIPForImageClassificationModelTest(CLIPModelTesterMixin, PipelineTesterMi
use_attention_mask_options=(None,),
)
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
super().test_sdpa_can_dispatch_composite_models()
# We will verify our results on an image of cute cats
def prepare_img():

View File

@@ -18,7 +18,14 @@ import tempfile
import unittest
from transformers import is_torch_available, logging
from transformers.testing_utils import CaptureLogger, require_deterministic_for_xpu, require_torch, slow, torch_device
from transformers.testing_utils import (
CaptureLogger,
require_deterministic_for_xpu,
require_torch,
require_torch_sdpa,
slow,
torch_device,
)
from ...test_modeling_common import ids_tensor
from ..bart.test_modeling_bart import BartStandaloneDecoderModelTester
@@ -54,6 +61,8 @@ if is_torch_available():
@require_torch
class EncoderDecoderMixin:
supports_sdpa = False
def get_encoder_decoder_model(self, config, decoder_config):
raise NotImplementedError
@@ -670,6 +679,67 @@ class EncoderDecoderMixin:
max_diff = np.amax(np.abs(out_1 - out_2))
self.assertLessEqual(max_diff, 1e-5)
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
if not self.supports_sdpa:
self.skipTest("SDPA is not supported")
inputs_dict = self.prepare_config_and_inputs()
encoder_config, decoder_config = inputs_dict["config"], inputs_dict["decoder_config"]
config = EncoderDecoderConfig.from_encoder_decoder_configs(
encoder_config=encoder_config, decoder_config=decoder_config
)
model = EncoderDecoderModel(config=config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_sdpa = EncoderDecoderModel.from_pretrained(tmpdirname)
model_sdpa = model_sdpa.eval().to(torch_device)
# see https://github.com/huggingface/transformers/pull/32238
# Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
encoder_attn = "sdpa" if model.encoder._supports_sdpa else "eager"
decoder_attn = "sdpa" if model.decoder._supports_sdpa else "eager"
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
self.assertTrue(model_sdpa.encoder.config._attn_implementation == encoder_attn)
self.assertTrue(model_sdpa.decoder.config._attn_implementation == decoder_attn)
# Also test that nothing break if we request SDPA explicitly, when both sub-parts support it.
# If the model supports sdpa (i.e. all of sub-models supports it) we'll dispatch safely
# Otherwise we should raise error that SDPA is not supported, as some of the sub-models doesn't support
if encoder_attn == "sdpa" and decoder_attn == "sdpa":
model_sdpa_explicit = EncoderDecoderModel.from_pretrained(tmpdirname, attn_implementation="sdpa")
model_sdpa_explicit = model_sdpa_explicit.eval().to(torch_device)
self.assertTrue(model_sdpa_explicit.config._attn_implementation == "sdpa")
else:
with self.assertRaises(ValueError):
model_sdpa_explicit = EncoderDecoderModel.from_pretrained(tmpdirname, attn_implementation="sdpa")
model_eager = EncoderDecoderModel.from_pretrained(
tmpdirname,
attn_implementation="eager",
)
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
self.assertTrue(model_eager.encoder.config._attn_implementation == "eager")
self.assertTrue(model_eager.decoder.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
@require_torch
class BertEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
@@ -949,6 +1019,8 @@ class RoBertaEncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
@require_torch
class GPT2EncoderDecoderModelTest(EncoderDecoderMixin, unittest.TestCase):
supports_sdpa = True
def get_encoder_decoder_model(self, config, decoder_config):
encoder_model = BertModel(config)
decoder_model = GPT2LMHeadModel(decoder_config)

View File

@@ -88,6 +88,10 @@ class Gemma2ModelTest(GemmaModelTest, unittest.TestCase):
def test_model_outputs_equivalence(self, **kwargs):
pass
@unittest.skip("Gemma2's forcefully disables sdpa due to softcapping")
def test_sdpa_can_dispatch_non_composite_models(self):
pass
@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
@unittest.skip("Gemma2's eager attn/sdpa attn outputs are expected to be different")
def test_eager_matches_sdpa_inference(self):

View File

@@ -580,11 +580,9 @@ class IdeficsModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
model = IdeficsModel.from_pretrained(model_name)
self.assertIsNotNone(model)
@require_torch_sdpa
@slow
@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
def test_eager_matches_sdpa_inference(self, torch_dtype: str):
self.skipTest(reason="Idefics has a hard requirement on SDPA, skipping this test")
@unittest.skip("Idefics has a hard requirement on SDPA")
def test_sdpa_can_dispatch_non_composite_models(self):
pass
@unittest.skipIf(not is_torch_greater_or_equal_than_2_0, reason="pytorch 2.0 or higher is required")
@@ -806,6 +804,10 @@ class IdeficsForVisionText2TextTest(IdeficsModelTest, GenerationTesterMixin, uni
def test_training_gradient_checkpointing_use_reentrant_false(self):
pass
@unittest.skip("Idefics has a hard requirement on SDPA")
def test_sdpa_can_dispatch_non_composite_models(self):
pass
@unittest.skipIf(not is_torch_greater_or_equal_than_2_0, reason="pytorch 2.0 or higher is required")
@require_torch

View File

@@ -16,6 +16,7 @@
import copy
import gc
import tempfile
import unittest
from io import BytesIO
@@ -36,6 +37,7 @@ from transformers.testing_utils import (
require_torch,
require_torch_gpu,
require_torch_multi_gpu,
require_torch_sdpa,
slow,
torch_device,
)
@@ -180,6 +182,7 @@ class Idefics2ModelTest(ModelTesterMixin, unittest.TestCase):
test_pruning = False
test_resize_embeddings = True
test_head_masking = False
_is_composite = True
def setUp(self):
self.model_tester = Idefics2VisionText2TextModelTester(self)
@@ -327,6 +330,43 @@ class Idefics2ModelTest(ModelTesterMixin, unittest.TestCase):
# Check that the model can still do a forward pass successfully (every parameter should be resized)
model(**self._prepare_for_class(inputs_dict, model_class))
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_sdpa = model_class.from_pretrained(tmpdirname)
model_sdpa = model_sdpa.eval().to(torch_device)
vision_attn = None if model.vision_model._supports_sdpa else "eager"
perceiver_attn = None if model.connector.perceiver_resampler._supports_sdpa else "eager"
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
self.assertTrue(model_sdpa.vision_model.config._attn_implementation == vision_attn)
self.assertTrue(model_sdpa.connector.perceiver_resampler.config._attn_implementation == perceiver_attn)
model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
self.assertTrue(model_sdpa.connector.perceiver_resampler.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa and model_sdpa.config.model_type != "falcon":
raise ValueError("The SDPA model should have SDPA attention layers")
@require_torch
class Idefics2ForConditionalGenerationModelTest(GenerationTesterMixin, ModelTesterMixin, unittest.TestCase):

View File

@@ -32,6 +32,7 @@ from transformers.testing_utils import (
require_accelerate,
require_bitsandbytes,
require_torch,
require_torch_sdpa,
require_vision,
slow,
torch_device,
@@ -460,6 +461,7 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
test_resize_embeddings = False
test_attention_outputs = False
test_torchscript = False
_is_composite = True
def setUp(self):
self.model_tester = InstructBlipForConditionalGenerationDecoderOnlyModelTester(self)
@@ -529,6 +531,66 @@ class InstructBlipForConditionalGenerationDecoderOnlyTest(ModelTesterMixin, Gene
model = InstructBlipForConditionalGeneration.from_pretrained(model_name)
self.assertIsNotNone(model)
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
"""
Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model.
This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention".
In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model
is loaded, because we manually replicate requested attn implementation on each sub-config when loading.
See https://github.com/huggingface/transformers/pull/32238 for more info
The test tries to cover most general cases of composite models, VLMs with vision and text configs. Any model
that has a different set of sub-configs has to overwrite this test.
"""
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
if not self._is_composite:
self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_sdpa = model_class.from_pretrained(tmpdirname)
model_sdpa = model_sdpa.eval().to(torch_device)
text_attn = "sdpa" if model.language_model._supports_sdpa else "eager"
vision_attn = "sdpa" if model.vision_model._supports_sdpa else "eager"
qformer_attn = "sdpa" if model.qformer._supports_sdpa else "eager"
# `None` as it is the requested one which will be assigned to each sub-config
# Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
self.assertTrue(model.language_model.config._attn_implementation == text_attn)
self.assertTrue(model.vision_model.config._attn_implementation == vision_attn)
self.assertTrue(model.qformer.config._attn_implementation == qformer_attn)
model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
self.assertTrue(model_eager.qformer.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa and any(
module_attn == "sdpa" for module_attn in [text_attn, vision_attn, qformer_attn]
):
raise ValueError("The SDPA model should have SDPA attention layers")
# We will verify our results on an image of cute cats
def prepare_img():

View File

@@ -32,6 +32,7 @@ from transformers.testing_utils import (
require_accelerate,
require_bitsandbytes,
require_torch,
require_torch_sdpa,
require_vision,
slow,
torch_device,
@@ -481,6 +482,7 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
test_resize_embeddings = False
test_attention_outputs = False
test_torchscript = False
_is_composite = True
def setUp(self):
self.model_tester = InstructBlipVideoForConditionalGenerationDecoderOnlyModelTester(self)
@@ -550,6 +552,66 @@ class InstructBlipVideoForConditionalGenerationDecoderOnlyTest(
model = InstructBlipVideoForConditionalGeneration.from_pretrained(model_name)
self.assertIsNotNone(model)
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
"""
Tests if composite models dispatch correctly on SDPA/eager when requested so when loading the model.
This tests only by looking at layer names, as usually SDPA layers are calles "SDPAAttention".
In contrast to the above test, this one checks if the "config._attn_implamentation" is a dict after the model
is loaded, because we manually replicate requested attn implementation on each sub-config when loading.
See https://github.com/huggingface/transformers/pull/32238 for more info
The test tries to cover most general cases of composite models, VLMs with vision and text configs. Any model
that has a different set of sub-configs has to overwrite this test.
"""
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
if not self._is_composite:
self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_sdpa = model_class.from_pretrained(tmpdirname)
model_sdpa = model_sdpa.eval().to(torch_device)
text_attn = "sdpa" if model.language_model._supports_sdpa else "eager"
vision_attn = "sdpa" if model.vision_model._supports_sdpa else "eager"
qformer_attn = "sdpa" if model.qformer._supports_sdpa else "eager"
# `None` as it is the requested one which will be assigned to each sub-config
# Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
self.assertTrue(model.language_model.config._attn_implementation == text_attn)
self.assertTrue(model.vision_model.config._attn_implementation == vision_attn)
self.assertTrue(model.qformer.config._attn_implementation == qformer_attn)
model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
self.assertTrue(model_eager.qformer.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa and any(
module_attn == "sdpa" for module_attn in [text_attn, vision_attn, qformer_attn]
):
raise ValueError("The SDPA model should have SDPA attention layers")
# We will verify our results on an image of cute cats
def prepare_video():

View File

@@ -25,8 +25,17 @@ import requests
from transformers import AutoModelForImageTextToText, AutoProcessor, Kosmos2Config
from transformers.models.kosmos2.configuration_kosmos2 import Kosmos2TextConfig, Kosmos2VisionConfig
from transformers.testing_utils import IS_ROCM_SYSTEM, require_torch, require_vision, slow, torch_device
from transformers.utils import is_torch_available, is_vision_available
from transformers.testing_utils import (
IS_ROCM_SYSTEM,
require_torch,
require_vision,
slow,
torch_device,
)
from transformers.utils import (
is_torch_available,
is_vision_available,
)
from ...test_configuration_common import ConfigTester
from ...test_modeling_common import (
@@ -257,6 +266,7 @@ class Kosmos2ModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
test_pruning = False
test_resize_embeddings = False
test_attention_outputs = False
_is_composite = True
# TODO: `image-to-text` pipeline for this model needs Processor.
def is_pipeline_test_to_skip(

View File

@@ -186,6 +186,7 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
pipeline_model_mapping = {"image-to-text": LlavaForConditionalGeneration} if is_torch_available() else {}
test_pruning = False
test_head_masking = False
_is_composite = True
def setUp(self):
self.model_tester = LlavaVisionText2TextModelTester(self)
@@ -260,6 +261,16 @@ class LlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTesterM
def test_sdpa_can_dispatch_on_flash(self):
pass
@unittest.skip("FlashAttention only support fp16 and bf16 data type")
def test_flash_attn_2_fp32_ln(self):
pass
@unittest.skip(
"VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test"
)
def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
pass
@require_torch
class LlavaForConditionalGenerationIntegrationTest(unittest.TestCase):

View File

@@ -218,6 +218,7 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
all_generative_model_classes = (LlavaNextForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False
_is_composite = True
def setUp(self):
self.model_tester = LlavaNextVisionText2TextModelTester(self)
@@ -316,6 +317,16 @@ class LlavaNextForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
def test_sdpa_can_dispatch_on_flash(self):
pass
@unittest.skip("FlashAttention only support fp16 and bf16 data type")
def test_flash_attn_2_fp32_ln(self):
pass
@unittest.skip(
"VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test"
)
def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
pass
@require_torch
class LlavaNextForConditionalGenerationIntegrationTest(unittest.TestCase):

View File

@@ -236,6 +236,7 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
all_generative_model_classes = (LlavaNextVideoForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False
_is_composite = True
def setUp(self):
self.model_tester = LlavaNextVideoVisionText2TextModelTester(self)
@@ -340,6 +341,16 @@ class LlavaNextVideoForConditionalGenerationModelTest(ModelTesterMixin, Generati
def test_sdpa_can_dispatch_on_flash(self):
pass
@unittest.skip("FlashAttention only support fp16 and bf16 data type")
def test_flash_attn_2_fp32_ln(self):
pass
@unittest.skip(
"VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test"
)
def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
pass
@require_torch
class LlavaNextVideoForConditionalGenerationIntegrationTest(unittest.TestCase):

View File

@@ -219,6 +219,7 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati
all_generative_model_classes = (LlavaOnevisionForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False
_is_composite = True
def setUp(self):
self.model_tester = LlavaOnevisionVisionText2TextModelTester(self)
@@ -306,6 +307,16 @@ class LlavaOnevisionForConditionalGenerationModelTest(ModelTesterMixin, Generati
def test_assisted_decoding_with_num_logits_to_keep(self):
pass
@unittest.skip("FlashAttention only support fp16 and bf16 data type")
def test_flash_attn_2_fp32_ln(self):
pass
@unittest.skip(
"VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test"
)
def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
pass
@require_torch
class LlavaOnevisionForConditionalGenerationIntegrationTest(unittest.TestCase):

View File

@@ -274,6 +274,7 @@ class MllamaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTester
test_pruning = False
test_head_masking = False
test_torchscript = False
_is_composite = True
def setUp(self):
self.model_tester = MllamaVisionText2TextModelTester(self)

View File

@@ -654,8 +654,6 @@ class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype)
model_sdpa = model_sdpa.eval().to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch_dtype,
@@ -663,20 +661,6 @@ class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
)
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa and model_sdpa.config.model_type != "falcon":
raise ValueError("The SDPA model should have SDPA attention layers")
# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model,
# but it would be nicer to have an efficient way to use parameterized.expand
fail_cases = []
@@ -1042,6 +1026,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
# not to test torchscript as the model tester doesn't prepare `input_values` and `padding_mask`
# (and `torchscript` hates `None` values).
test_torchscript = False
_is_composite = True
def setUp(self):
self.model_tester = MusicgenTester(self)
@@ -1420,7 +1405,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
@require_torch_gpu
@mark.flash_attn_test
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence
# Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence
def test_flash_attn_2_inference_equivalence(self):
for model_class in self.all_model_classes:
if not model_class._supports_flash_attn_2:
@@ -1432,7 +1417,9 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_fa = model_class.from_pretrained(
tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
tmpdirname,
torch_dtype=torch.bfloat16,
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
)
model_fa.to(torch_device)
@@ -1505,7 +1492,88 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
@require_torch_gpu
@mark.flash_attn_test
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence_right_padding
def test_flash_attn_2_conversion(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
).to(torch_device)
for _, module in model.named_modules():
if "FlashAttention" in module.__class__.__name__:
return
self.assertTrue(False, "FlashAttention2 modules not found in model")
@require_torch_sdpa
@require_torch_gpu
@slow
def test_sdpa_can_dispatch_on_flash(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
torch.compiler.reset()
compute_capability = torch.cuda.get_device_capability()
major, _ = compute_capability
if not torch.version.cuda or major < 8:
self.skipTest(reason="This test requires an NVIDIA GPU with compute capability >= 8.0")
for model_class in self.all_model_classes:
if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
inputs_dict = self._prepare_for_class(inputs_dict, model_class)
if config.model_type in ["llava", "llava_next", "vipllava", "video_llava"]:
self.skipTest(
reason="Llava-like models currently (transformers==4.39.1) requires an attention_mask input"
)
if config.model_type in ["paligemma"]:
self.skipTest(
"PaliGemma-like models currently (transformers==4.41.0) requires an attention_mask input"
)
if config.model_type in ["idefics", "idefics2", "idefics3"]:
self.skipTest(reason="Idefics currently (transformers==4.39.1) requires an image_attention_mask input")
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation={"decoder": "sdpa", "audio_encoder": None, "text_encoder": None},
)
model.to(torch_device)
inputs_dict.pop("attention_mask", None)
inputs_dict.pop("decoder_attention_mask", None)
for name, inp in inputs_dict.items():
if isinstance(inp, torch.Tensor) and inp.dtype in [torch.float32, torch.float16]:
inputs_dict[name] = inp.to(torch.float16)
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
_ = model(**inputs_dict)
@require_flash_attn
@require_torch_gpu
@mark.flash_attn_test
@slow
# Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence_right_padding
def test_flash_attn_2_inference_equivalence_right_padding(self):
for model_class in self.all_model_classes:
if not model_class._supports_flash_attn_2:
@@ -1517,7 +1585,9 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_fa = model_class.from_pretrained(
tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
tmpdirname,
torch_dtype=torch.bfloat16,
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
)
model_fa.to(torch_device)
@@ -1587,7 +1657,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
@require_torch_gpu
@mark.flash_attn_test
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding
# Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding
def test_flash_attn_2_generate_left_padding(self):
# Ignore copy
for model_class in self.greedy_sample_model_classes:
@@ -1622,7 +1692,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
low_cpu_mem_usage=True,
).to(torch_device)
@@ -1636,7 +1706,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
@require_torch_gpu
@mark.flash_attn_test
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right
# Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right
def test_flash_attn_2_generate_padding_right(self):
# Ignore copy
for model_class in self.greedy_sample_model_classes:
@@ -1670,7 +1740,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
low_cpu_mem_usage=True,
).to(torch_device)
@@ -1684,7 +1754,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
@require_torch_gpu
@mark.flash_attn_test
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache
# Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache
def test_flash_attn_2_generate_use_cache(self):
max_new_tokens = 30
@@ -1713,7 +1783,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
low_cpu_mem_usage=True,
).to(torch_device)
@@ -1726,6 +1796,53 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
use_cache=True,
)
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
if not self._is_composite:
self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_sdpa = model_class.from_pretrained(tmpdirname)
model_sdpa = model_sdpa.eval().to(torch_device)
audio_encoder_attn = "sdpa" if model.audio_encoder._supports_sdpa else "eager"
text_encoder_attn = "sdpa" if model.text_encoder._supports_sdpa else "eager"
decoder_attn = "sdpa" if model.decoder._supports_sdpa else "eager"
# `None` as it is the requested one which will be assigned to each sub-config
# Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
self.assertTrue(model_sdpa.audio_encoder.config._attn_implementation == audio_encoder_attn)
self.assertTrue(model_sdpa.text_encoder.config._attn_implementation == text_encoder_attn)
self.assertTrue(model_sdpa.decoder.config._attn_implementation == decoder_attn)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.audio_encoder.config._attn_implementation == "eager")
self.assertTrue(model_eager.text_encoder.config._attn_implementation == "eager")
self.assertTrue(model_eager.decoder.config._attn_implementation == "eager")
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa and model_sdpa.config.model_type != "falcon":
raise ValueError("The SDPA model should have SDPA attention layers")
@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
@require_torch_sdpa
@slow
@@ -1792,8 +1909,6 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype)
model_sdpa = model_sdpa.eval().to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch_dtype,
@@ -1801,20 +1916,6 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
)
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa and model_sdpa.config.model_type != "falcon":
raise ValueError("The SDPA model should have SDPA attention layers")
# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model,
# but it would be nicer to have an efficient way to use parameterized.expand
fail_cases = []

View File

@@ -311,7 +311,9 @@ class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittes
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_fa = model_class.from_pretrained(
tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
tmpdirname,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
)
model_fa.to(torch_device)
@@ -391,7 +393,9 @@ class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittes
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_fa = model_class.from_pretrained(
tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
tmpdirname,
torch_dtype=torch.bfloat16,
attn_implementation="flash_attention_2",
)
model_fa.to(torch_device)
@@ -454,148 +458,10 @@ class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittes
assert torch.allclose(logits_fa[:-1], logits[:-1], atol=4e-2, rtol=4e-2)
@require_flash_attn
@require_torch_gpu
@mark.flash_attn_test
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding
def test_flash_attn_2_generate_left_padding(self):
# Ignore copy
for model_class in self.greedy_sample_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(
torch_device
)
dummy_input = inputs_dict[model.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
# make sure we do left padding
dummy_attention_mask[:, :-1] = 0
dummy_attention_mask[:, -1:] = 1
out = model.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False
)
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
).to(torch_device)
out_fa = model.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False
)
self.assertTrue(torch.allclose(out, out_fa))
@require_flash_attn
@require_torch_gpu
@mark.flash_attn_test
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right
def test_flash_attn_2_generate_padding_right(self):
# Ignore copy
for model_class in self.greedy_sample_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(tmpdirname, torch_dtype=torch.float16, low_cpu_mem_usage=True).to(
torch_device
)
dummy_input = inputs_dict[model.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
# make sure we do right padding
dummy_attention_mask[:, :-1] = 1
dummy_attention_mask[:, -1:] = 0
out = model.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False
)
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
).to(torch_device)
out_fa = model.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=8, do_sample=False
)
self.assertTrue(torch.allclose(out, out_fa))
@require_flash_attn
@require_torch_gpu
@mark.flash_attn_test
@slow
# Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.test_flash_attn_2_generate_use_cache
def test_flash_attn_2_generate_use_cache(self):
max_new_tokens = 30
# Ignore copy
for model_class in self.greedy_sample_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
low_cpu_mem_usage=True,
).to(torch_device)
# Just test that a large cache works as expected
_ = model.generate(
dummy_input,
attention_mask=dummy_attention_mask,
max_new_tokens=max_new_tokens,
do_sample=False,
use_cache=True,
)
@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
@require_torch_sdpa
@slow
# Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.test_eager_matches_sdpa_inference
# Copied from tests.test_modeling_common.ModelTesterMixin.test_eager_matches_sdpa_inference
def test_eager_matches_sdpa_inference(self, torch_dtype: str):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
@@ -658,8 +524,6 @@ class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittes
model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype)
model_sdpa = model_sdpa.eval().to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch_dtype,
@@ -667,20 +531,6 @@ class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittes
)
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa and model_sdpa.config.model_type != "falcon":
raise ValueError("The SDPA model should have SDPA attention layers")
# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model,
# but it would be nicer to have an efficient way to use parameterized.expand
fail_cases = []
@@ -839,74 +689,6 @@ class MusicgenMelodyDecoderTest(ModelTesterMixin, GenerationTesterMixin, unittes
self.assertTrue(len(fail_cases) == 0, "\n".join(fail_cases))
@require_torch_sdpa
@slow
# Copied from tests.models.musicgen.test_modeling_musicgen.MusicgenDecoderTest.test_eager_matches_sdpa_generate
def test_eager_matches_sdpa_generate(self):
max_new_tokens = 30
# Ignore copy
for model_class in self.greedy_sample_model_classes:
if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
dummy_input = inputs_dict[model_class.main_input_name]
if dummy_input.dtype in [torch.float32, torch.bfloat16]:
dummy_input = dummy_input.to(torch.float16)
# make sure that all models have enough positions for generation
if hasattr(config, "max_position_embeddings"):
config.max_position_embeddings = max_new_tokens + dummy_input.shape[1] + 1
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
dummy_attention_mask = inputs_dict.get("attention_mask", torch.ones_like(dummy_input))
model_sdpa = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
).to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
low_cpu_mem_usage=True,
attn_implementation="eager",
).to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
# Just test that a large cache works as expected
res_eager = model_eager.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
res_sdpa = model_sdpa.generate(
dummy_input, attention_mask=dummy_attention_mask, max_new_tokens=max_new_tokens, do_sample=False
)
self.assertTrue(torch.allclose(res_eager, res_sdpa))
def prepare_musicgen_melody_inputs_dict(
config,
@@ -1048,6 +830,7 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
# not to test torchscript as the model tester doesn't prepare `input_features` and `padding_mask`
# (and `torchscript` hates `None` values).
test_torchscript = False
_is_composite = True
def setUp(self):
self.model_tester = MusicgenMelodyTester(self)
@@ -1406,7 +1189,7 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
@require_torch_gpu
@mark.flash_attn_test
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence
# Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence
def test_flash_attn_2_inference_equivalence(self):
for model_class in self.all_model_classes:
if not model_class._supports_flash_attn_2:
@@ -1418,7 +1201,9 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_fa = model_class.from_pretrained(
tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
tmpdirname,
torch_dtype=torch.bfloat16,
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
)
model_fa.to(torch_device)
@@ -1491,7 +1276,88 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
@require_torch_gpu
@mark.flash_attn_test
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence_right_padding
def test_flash_attn_2_conversion(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
).to(torch_device)
for _, module in model.named_modules():
if "FlashAttention" in module.__class__.__name__:
return
self.assertTrue(False, "FlashAttention2 modules not found in model")
@require_torch_sdpa
@require_torch_gpu
@slow
def test_sdpa_can_dispatch_on_flash(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
torch.compiler.reset()
compute_capability = torch.cuda.get_device_capability()
major, _ = compute_capability
if not torch.version.cuda or major < 8:
self.skipTest(reason="This test requires an NVIDIA GPU with compute capability >= 8.0")
for model_class in self.all_model_classes:
if not model_class._supports_sdpa:
self.skipTest(f"{model_class.__name__} does not support SDPA")
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
inputs_dict = self._prepare_for_class(inputs_dict, model_class)
if config.model_type in ["llava", "llava_next", "vipllava", "video_llava"]:
self.skipTest(
reason="Llava-like models currently (transformers==4.39.1) requires an attention_mask input"
)
if config.model_type in ["paligemma"]:
self.skipTest(
"PaliGemma-like models currently (transformers==4.41.0) requires an attention_mask input"
)
if config.model_type in ["idefics", "idefics2", "idefics3"]:
self.skipTest(reason="Idefics currently (transformers==4.39.1) requires an image_attention_mask input")
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation={"decoder": "sdpa", "audio_encoder": None, "text_encoder": None},
)
model.to(torch_device)
inputs_dict.pop("attention_mask", None)
inputs_dict.pop("decoder_attention_mask", None)
for name, inp in inputs_dict.items():
if isinstance(inp, torch.Tensor) and inp.dtype in [torch.float32, torch.float16]:
inputs_dict[name] = inp.to(torch.float16)
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
_ = model(**inputs_dict)
@require_flash_attn
@require_torch_gpu
@mark.flash_attn_test
@slow
# Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence_right_padding
def test_flash_attn_2_inference_equivalence_right_padding(self):
for model_class in self.all_model_classes:
if not model_class._supports_flash_attn_2:
@@ -1503,7 +1369,9 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_fa = model_class.from_pretrained(
tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
tmpdirname,
torch_dtype=torch.bfloat16,
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
)
model_fa.to(torch_device)
@@ -1573,7 +1441,7 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
@require_torch_gpu
@mark.flash_attn_test
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding
# Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding
def test_flash_attn_2_generate_left_padding(self):
# Ignore copy
for model_class in self.greedy_sample_model_classes:
@@ -1608,7 +1476,7 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
low_cpu_mem_usage=True,
).to(torch_device)
@@ -1622,7 +1490,7 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
@require_torch_gpu
@mark.flash_attn_test
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right
# Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right
def test_flash_attn_2_generate_padding_right(self):
# Ignore copy
for model_class in self.greedy_sample_model_classes:
@@ -1656,7 +1524,7 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
low_cpu_mem_usage=True,
).to(torch_device)
@@ -1670,7 +1538,7 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
@require_torch_gpu
@mark.flash_attn_test
@slow
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache
# Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache
def test_flash_attn_2_generate_use_cache(self):
max_new_tokens = 30
@@ -1699,7 +1567,7 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
model = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch.float16,
attn_implementation="flash_attention_2",
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
low_cpu_mem_usage=True,
).to(torch_device)
@@ -1712,6 +1580,53 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
use_cache=True,
)
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
if not self._is_composite:
self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_sdpa = model_class.from_pretrained(tmpdirname)
model_sdpa = model_sdpa.eval().to(torch_device)
audio_encoder_attn = "sdpa" if model.audio_encoder._supports_sdpa else "eager"
text_encoder_attn = "sdpa" if model.text_encoder._supports_sdpa else "eager"
decoder_attn = "sdpa" if model.decoder._supports_sdpa else "eager"
# `None` as it is the requested one which will be assigned to each sub-config
# Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
self.assertTrue(model_sdpa.audio_encoder.config._attn_implementation == audio_encoder_attn)
self.assertTrue(model_sdpa.text_encoder.config._attn_implementation == text_encoder_attn)
self.assertTrue(model_sdpa.decoder.config._attn_implementation == decoder_attn)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.audio_encoder.config._attn_implementation == "eager")
self.assertTrue(model_eager.text_encoder.config._attn_implementation == "eager")
self.assertTrue(model_eager.decoder.config._attn_implementation == "eager")
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa and model_sdpa.config.model_type != "falcon":
raise ValueError("The SDPA model should have SDPA attention layers")
@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
@require_torch_sdpa
@slow
@@ -1775,8 +1690,6 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype)
model_sdpa = model_sdpa.eval().to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
model_eager = model_class.from_pretrained(
tmpdirname,
torch_dtype=torch_dtype,
@@ -1784,20 +1697,6 @@ class MusicgenMelodyTest(ModelTesterMixin, GenerationTesterMixin, PipelineTester
)
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
if "SdpaAttention" in submodule.__class__.__name__:
has_sdpa = True
break
if not has_sdpa and model_sdpa.config.model_type != "falcon":
raise ValueError("The SDPA model should have SDPA attention layers")
# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model,
# but it would be nicer to have an efficient way to use parameterized.expand
fail_cases = []

View File

@@ -187,6 +187,7 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
test_pruning = False
test_torchscript = False
test_head_masking = False
_is_composite = True
def setUp(self):
self.model_tester = PaliGemmaVisionText2TextModelTester(self)
@@ -319,6 +320,16 @@ class PaliGemmaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTes
def test_static_cache_matches_dynamic(self):
pass
@unittest.skip("FlashAttention only support fp16 and bf16 data type")
def test_flash_attn_2_fp32_ln(self):
pass
@unittest.skip(
"VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test"
)
def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
pass
@slow
@require_torch

View File

@@ -15,6 +15,7 @@
"""Testing suite for the PyTorch Qwen2Audio model."""
import gc
import tempfile
import unittest
from io import BytesIO
from urllib.request import urlopen
@@ -29,6 +30,7 @@ from transformers import (
)
from transformers.testing_utils import (
require_torch,
require_torch_sdpa,
slow,
torch_device,
)
@@ -152,6 +154,7 @@ class Qwen2AudioForConditionalGenerationModelTest(ModelTesterMixin, unittest.Tes
all_model_classes = (Qwen2AudioForConditionalGeneration,) if is_torch_available() else ()
test_pruning = False
test_head_masking = False
_is_composite = True
def setUp(self):
self.model_tester = Qwen2AudioModelTester(self)
@@ -165,6 +168,53 @@ class Qwen2AudioForConditionalGenerationModelTest(ModelTesterMixin, unittest.Tes
def test_sdpa_can_dispatch_on_flash(self):
pass
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
# overwrite because Qwen2 is audio+text model (not vision+text)
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
if not self._is_composite:
self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_sdpa = model_class.from_pretrained(tmpdirname)
model_sdpa = model_sdpa.eval().to(torch_device)
text_attn = "sdpa" if model.language_model._supports_sdpa else "eager"
vision_attn = "sdpa" if model.audio_tower._supports_sdpa else "eager"
# `None` as it is the requested one which will be assigned to each sub-config
# Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
self.assertTrue(model.language_model.config._attn_implementation == text_attn)
self.assertTrue(model.audio_tower.config._attn_implementation == vision_attn)
model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
self.assertTrue(model_eager.language_model.config._attn_implementation == "eager")
self.assertTrue(model_eager.audio_tower.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa and model_sdpa.config.model_type != "falcon":
raise ValueError("The SDPA model should have SDPA attention layers")
@require_torch
class Qwen2AudioForConditionalGenerationIntegrationTest(unittest.TestCase):

View File

@@ -71,6 +71,51 @@ if is_vision_available():
class SiglipModelTesterMixin(ModelTesterMixin):
def test_sdpa_can_dispatch_composite_models(self):
for model_class in self.all_model_classes:
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
# Load the model with SDPA
model_sdpa = model_class.from_pretrained(tmpdirname)
model_sdpa = model_sdpa.eval().to(torch_device)
# Load model with eager attention
model_eager = model_class.from_pretrained(
tmpdirname,
attn_implementation="eager",
)
model_eager = model_eager.eval().to(torch_device)
# SigLip has one shared cls attr for all models, so we assign both submodels heer
vision_attn = text_attn = "sdpa" if model._supports_sdpa else "eager"
if hasattr(model_sdpa, "vision_model") and hasattr(model_sdpa, "text_model"):
self.assertTrue(model_sdpa.vision_model.config._attn_implementation == vision_attn)
self.assertTrue(model_sdpa.text_model.config._attn_implementation == text_attn)
self.assertTrue(model_eager.vision_model.config._attn_implementation == "eager")
self.assertTrue(model_eager.text_model.config._attn_implementation == "eager")
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa and model_sdpa.config.model_type != "falcon":
raise ValueError("The SDPA model should have SDPA attention layers")
def test_eager_matches_sdpa_inference(
self,
torch_dtype: str,
@@ -132,23 +177,6 @@ class SiglipModelTesterMixin(ModelTesterMixin):
)
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
self.assertTrue(model_eager.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa and model_sdpa.config.model_type != "falcon":
raise ValueError("The SDPA model should have SDPA attention layers")
# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving the model each time,
# but it would be nicer to have an efficient way to use parameterized.expand
cases = [
@@ -400,6 +428,10 @@ class SiglipVisionModelTest(SiglipModelTesterMixin, unittest.TestCase):
use_attention_mask_options=(False,),
)
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
super().test_sdpa_can_dispatch_composite_models()
class SiglipTextModelTester:
def __init__(
@@ -562,6 +594,10 @@ class SiglipTextModelTest(SiglipModelTesterMixin, unittest.TestCase):
use_attention_mask_options=(False, True),
)
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
super().test_sdpa_can_dispatch_composite_models()
class SiglipModelTester:
def __init__(self, parent, text_kwargs=None, vision_kwargs=None, is_training=True):
@@ -629,6 +665,7 @@ class SiglipModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.Test
test_cpu_offload = False
test_disk_offload_safetensors = False
test_disk_offload_bin = False
_is_composite = True
# Copied from tests.models.clip.test_modeling_clip.CLIPModelTest.setUp with CLIP->Siglip
def setUp(self):
@@ -851,6 +888,10 @@ class SiglipModelTest(SiglipModelTesterMixin, PipelineTesterMixin, unittest.Test
use_attention_mask_options=(False, True),
)
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
super().test_sdpa_can_dispatch_composite_models()
class SiglipForImageClassificationModelTester(SiglipModelTester):
def __init__(self, parent):
@@ -888,6 +929,7 @@ class SiglipForImageClassificationModelTest(SiglipModelTesterMixin, PipelineTest
test_cpu_offload = False
test_disk_offload_safetensors = False
test_disk_offload_bin = False
_is_composite = True
def setUp(self):
self.model_tester = SiglipForImageClassificationModelTester(self)
@@ -925,6 +967,10 @@ class SiglipForImageClassificationModelTest(SiglipModelTesterMixin, PipelineTest
torch_dtype=torch_dtype, logit_keys=("logits",), use_attention_mask_options=(False,)
)
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
super().test_sdpa_can_dispatch_composite_models()
# We will verify our results on an image of cute cats
def prepare_img():

View File

@@ -18,7 +18,13 @@ import tempfile
import unittest
from transformers import is_torch_available
from transformers.testing_utils import require_deterministic_for_xpu, require_torch, slow, torch_device
from transformers.testing_utils import (
require_deterministic_for_xpu,
require_torch,
require_torch_sdpa,
slow,
torch_device,
)
from ...test_modeling_common import floats_tensor, ids_tensor, random_attention_mask
from ..bert.test_modeling_bert import BertModelTester
@@ -441,6 +447,66 @@ class EncoderDecoderMixin:
max_diff = np.amax(np.abs(out_1 - out_2))
self.assertLessEqual(max_diff, 1e-5)
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
inputs_dict = self.prepare_config_and_inputs()
encoder_config, decoder_config = inputs_dict["config"], inputs_dict["decoder_config"]
config = SpeechEncoderDecoderConfig.from_encoder_decoder_configs(
encoder_config=encoder_config, decoder_config=decoder_config
)
model = SpeechEncoderDecoderModel(config=config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_sdpa = SpeechEncoderDecoderModel.from_pretrained(tmpdirname)
model_sdpa = model_sdpa.eval().to(torch_device)
# see https://github.com/huggingface/transformers/pull/32238
# Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
encoder_attn = "sdpa" if model.encoder._supports_sdpa else "eager"
decoder_attn = "sdpa" if model.decoder._supports_sdpa else "eager"
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
self.assertTrue(model_sdpa.encoder.config._attn_implementation == encoder_attn)
self.assertTrue(model_sdpa.decoder.config._attn_implementation == decoder_attn)
# Also test that nothing break if we request SDPA explicitly, when both sub-parts support it.
# If the model supports sdpa (i.e. all of sub-models supports it) we'll dispatch safely
# Otherwise we should raise error that SDPA is not supported, as some of the sub-models doesn't support
if encoder_attn == "sdpa" and decoder_attn == "sdpa":
model_sdpa_explicit = SpeechEncoderDecoderModel.from_pretrained(tmpdirname, attn_implementation="sdpa")
model_sdpa_explicit = model_sdpa_explicit.eval().to(torch_device)
self.assertTrue(model_sdpa_explicit.config._attn_implementation == "sdpa")
else:
with self.assertRaises(ValueError):
model_sdpa_explicit = SpeechEncoderDecoderModel.from_pretrained(
tmpdirname, attn_implementation="sdpa"
)
model_eager = SpeechEncoderDecoderModel.from_pretrained(
tmpdirname,
attn_implementation="eager",
)
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
self.assertTrue(model_eager.encoder.config._attn_implementation == "eager")
self.assertTrue(model_eager.decoder.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
@require_torch
class Wav2Vec2BertModelTest(EncoderDecoderMixin, unittest.TestCase):

View File

@@ -206,6 +206,7 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
test_pruning = False
test_resize_embeddings = True
test_head_masking = False
_is_composite = True
def setUp(self):
self.model_tester = VideoLlavaVisionText2TextModelTester(self)
@@ -237,6 +238,16 @@ class VideoLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTe
def test_sdpa_can_dispatch_on_flash(self):
pass
@unittest.skip("FlashAttention only support fp16 and bf16 data type")
def test_flash_attn_2_fp32_ln(self):
pass
@unittest.skip(
"VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test"
)
def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
pass
@unittest.skip(
reason="After #33533, this still passes, but many subsequential tests fail with `device-side assert triggered`"
)

View File

@@ -168,6 +168,7 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
test_pruning = False
test_resize_embeddings = True
test_head_masking = False
_is_composite = True
def setUp(self):
self.model_tester = VipLlavaVisionText2TextModelTester(self)
@@ -242,6 +243,16 @@ class VipLlavaForConditionalGenerationModelTest(ModelTesterMixin, GenerationTest
def test_sdpa_can_dispatch_on_flash(self):
pass
@unittest.skip("FlashAttention only support fp16 and bf16 data type")
def test_flash_attn_2_fp32_ln(self):
pass
@unittest.skip(
"VLMs need lots of steps to prepare images/mask correctly to get pad-free inputs. Can be tested as part of LLM test"
)
def test_flash_attention_2_padding_matches_padding_free_with_position_ids(self):
pass
@require_torch
class VipLlavaForConditionalGenerationIntegrationTest(unittest.TestCase):

View File

@@ -27,17 +27,24 @@ from transformers.testing_utils import (
require_nltk,
require_sentencepiece,
require_torch,
require_torch_sdpa,
require_vision,
slow,
to_2tuple,
torch_device,
)
from transformers.utils import cached_property, is_torch_available, is_vision_available
from transformers.utils import (
cached_property,
is_torch_available,
is_vision_available,
)
from ...test_modeling_common import floats_tensor, ids_tensor, random_attention_mask
from ..bart.test_modeling_bart import BartModelTester
from ..bert.test_modeling_bert import BertModelTester
from ..deit.test_modeling_deit import DeiTModelTester
from ..donut.test_modeling_donut_swin import DonutSwinModelTester
from ..gpt2.test_modeling_gpt2 import GPT2ModelTester
from ..layoutlmv3.test_modeling_layoutlmv3 import LayoutLMv3ModelTester
from ..swin.test_modeling_swin import SwinModelTester
from ..trocr.test_modeling_trocr import TrOCRStandaloneDecoderModelTester
@@ -53,6 +60,8 @@ if is_torch_available():
BartForCausalLM,
BertLMHeadModel,
DeiTModel,
DonutSwinModel,
GPT2LMHeadModel,
LayoutLMv3Model,
SwinModel,
TrOCRForCausalLM,
@@ -72,6 +81,8 @@ if is_vision_available():
@require_torch
class EncoderDecoderMixin:
supports_sdpa = False
def get_encoder_decoder_model(self, config, decoder_config):
pass
@@ -374,6 +385,69 @@ class EncoderDecoderMixin:
max_diff = np.amax(np.abs(out_1 - out_2))
self.assertLessEqual(max_diff, 1e-5)
@require_torch_sdpa
def test_sdpa_can_dispatch_composite_models(self):
if not self.supports_sdpa:
self.skipTest("SDPA is not supported")
inputs_dict = self.prepare_config_and_inputs()
encoder_config, decoder_config = inputs_dict["config"], inputs_dict["decoder_config"]
config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(
encoder_config=encoder_config, decoder_config=decoder_config
)
model = VisionEncoderDecoderModel(config=config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model_sdpa = VisionEncoderDecoderModel.from_pretrained(tmpdirname)
model_sdpa = model_sdpa.eval().to(torch_device)
# see https://github.com/huggingface/transformers/pull/32238
# Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
encoder_attn = "sdpa" if model.encoder._supports_sdpa else "eager"
decoder_attn = "sdpa" if model.decoder._supports_sdpa else "eager"
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
self.assertTrue(model_sdpa.encoder.config._attn_implementation == encoder_attn)
self.assertTrue(model_sdpa.decoder.config._attn_implementation == decoder_attn)
# Also test that nothing break if we request SDPA explicitly, when both sub-parts support it.
# If the model supports sdpa (i.e. all of sub-models supports it) we'll dispatch safely
# Otherwise we should raise error that SDPA is not supported, as some of the sub-models doesn't support
if encoder_attn == "sdpa" and decoder_attn == "sdpa":
model_sdpa_explicit = VisionEncoderDecoderModel.from_pretrained(tmpdirname, attn_implementation="sdpa")
model_sdpa_explicit = model_sdpa_explicit.eval().to(torch_device)
self.assertTrue(model_sdpa_explicit.config._attn_implementation == "sdpa")
else:
with self.assertRaises(ValueError):
model_sdpa_explicit = VisionEncoderDecoderModel.from_pretrained(
tmpdirname, attn_implementation="sdpa"
)
model_eager = VisionEncoderDecoderModel.from_pretrained(
tmpdirname,
attn_implementation="eager",
)
model_eager = model_eager.eval().to(torch_device)
self.assertTrue(model_eager.config._attn_implementation == "eager")
self.assertTrue(model_eager.encoder.config._attn_implementation == "eager")
self.assertTrue(model_eager.decoder.config._attn_implementation == "eager")
for name, submodule in model_eager.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
raise ValueError("The eager model should not have SDPA attention layers")
has_sdpa = False
for name, submodule in model_sdpa.named_modules():
class_name = submodule.__class__.__name__
if "SdpaAttention" in class_name or "SdpaSelfAttention" in class_name:
has_sdpa = True
break
if not has_sdpa:
raise ValueError("The SDPA model should have SDPA attention layers")
@require_torch
class DeiT2RobertaModelTest(EncoderDecoderMixin, unittest.TestCase):
@@ -497,6 +571,8 @@ class DeiT2RobertaModelTest(EncoderDecoderMixin, unittest.TestCase):
@require_torch
class ViT2BertModelTest(EncoderDecoderMixin, unittest.TestCase):
supports_sdpa = True # one submodel support SDPA
def get_pretrained_model_and_inputs(self):
model = VisionEncoderDecoderModel.from_encoder_decoder_pretrained(
"hf-internal-testing/tiny-random-vit", "hf-internal-testing/tiny-bert"
@@ -649,6 +725,8 @@ class Swin2BartModelTest(EncoderDecoderMixin, unittest.TestCase):
@require_torch
class ViT2TrOCR(EncoderDecoderMixin, unittest.TestCase):
supports_sdpa = True # one submodel support SDPA
def get_encoder_decoder_model(self, config, decoder_config):
encoder_model = ViTModel(config).eval()
decoder_model = TrOCRForCausalLM(decoder_config).eval()
@@ -804,6 +882,240 @@ class LayoutLMv32TrOCR(EncoderDecoderMixin, unittest.TestCase):
pass
@require_torch
class VIT2GPT2Test(EncoderDecoderMixin, unittest.TestCase):
supports_sdpa = True # both submodels support SDPA
def get_encoder_decoder_model(self, config, decoder_config):
encoder_model = ViTModel(config).eval()
decoder_model = GPT2LMHeadModel(decoder_config).eval()
return encoder_model, decoder_model
def prepare_config_and_inputs(self):
model_tester_encoder = ViTModelTester(self, batch_size=13)
model_tester_decoder = GPT2ModelTester(self, batch_size=13, hidden_size=32, max_position_embeddings=512)
encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs()
decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs()
config, pixel_values, labels = encoder_config_and_inputs
(
decoder_config,
decoder_input_ids,
decoder_attention_mask,
decoder_head_mask,
decoder_token_type_ids,
mc_token_ids,
sequence_labels,
token_labels,
choice_labels,
) = decoder_config_and_inputs
# make sure that cross attention layers are added
decoder_config.add_cross_attention = True
# disable cache for now
decoder_config.use_cache = False
return {
"config": config,
"pixel_values": pixel_values,
"decoder_config": decoder_config,
"decoder_input_ids": decoder_input_ids,
"decoder_attention_mask": decoder_attention_mask,
"decoder_head_mask": decoder_head_mask,
"labels": decoder_input_ids,
}
def check_encoder_decoder_model_output_attentions(
self,
config,
decoder_config,
decoder_input_ids,
decoder_attention_mask,
pixel_values,
labels=None,
**kwargs,
):
# make the decoder inputs a different shape from the encoder inputs to harden the test
decoder_input_ids = decoder_input_ids[:, :-1]
decoder_attention_mask = decoder_attention_mask[:, :-1]
encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
enc_dec_model.to(torch_device)
outputs_encoder_decoder = enc_dec_model(
pixel_values=pixel_values,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
output_attentions=True,
**kwargs,
)
encoder_attentions = outputs_encoder_decoder["encoder_attentions"]
self.assertEqual(len(encoder_attentions), config.num_hidden_layers)
seq_len = (encoder_model.config.image_size // encoder_model.config.patch_size) ** 2 + 1
decoder_attentions = outputs_encoder_decoder["decoder_attentions"]
num_decoder_layers = (
decoder_config.num_decoder_layers
if hasattr(decoder_config, "num_decoder_layers")
else decoder_config.num_hidden_layers
)
self.assertEqual(len(decoder_attentions), num_decoder_layers)
self.assertEqual(
decoder_attentions[0].shape[-3:],
(decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]),
)
cross_attentions = outputs_encoder_decoder["cross_attentions"]
self.assertEqual(len(cross_attentions), num_decoder_layers)
cross_attention_input_seq_len = decoder_input_ids.shape[-1]
self.assertEqual(
cross_attentions[0].shape[-3:],
(decoder_config.num_attention_heads, cross_attention_input_seq_len, seq_len), # 4 6 16
)
def check_encoder_decoder_model_generate(self, config, decoder_config, pixel_values=None, **kwargs):
encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
# Generate until max length
if hasattr(enc_dec_model.config, "eos_token_id"):
enc_dec_model.config.eos_token_id = None
if hasattr(enc_dec_model.config, "decoder") and hasattr(enc_dec_model.config.decoder, "eos_token_id"):
enc_dec_model.config.decoder.eos_token_id = None
if hasattr(enc_dec_model.generation_config, "eos_token_id"):
enc_dec_model.generation_config.eos_token_id = None
enc_dec_model.to(torch_device)
generated_output = enc_dec_model.generate(
pixel_values=pixel_values,
decoder_start_token_id=enc_dec_model.config.decoder.bos_token_id,
**kwargs,
)
self.assertEqual(generated_output.shape, (pixel_values.shape[0],) + (decoder_config.max_length,))
@unittest.skip(reason="VIT2GPT2 also has an integration test for testinf save-load")
def test_real_model_save_load_from_pretrained(self):
pass
@require_torch
class Donut2GPT2Test(EncoderDecoderMixin, unittest.TestCase):
supports_sdpa = True # one submodel (GPT2) support SDPA
def get_encoder_decoder_model(self, config, decoder_config):
encoder_model = DonutSwinModel(config).eval()
decoder_model = GPT2LMHeadModel(decoder_config).eval()
return encoder_model, decoder_model
def prepare_config_and_inputs(self):
model_tester_encoder = DonutSwinModelTester(self, batch_size=13)
model_tester_decoder = GPT2ModelTester(self, batch_size=13, hidden_size=32, max_position_embeddings=512)
encoder_config_and_inputs = model_tester_encoder.prepare_config_and_inputs()
decoder_config_and_inputs = model_tester_decoder.prepare_config_and_inputs()
config, pixel_values, labels = encoder_config_and_inputs
(
decoder_config,
decoder_input_ids,
decoder_attention_mask,
decoder_head_mask,
decoder_token_type_ids,
mc_token_ids,
sequence_labels,
token_labels,
choice_labels,
) = decoder_config_and_inputs
# make sure that cross attention layers are added
decoder_config.add_cross_attention = True
# disable cache for now
decoder_config.use_cache = False
return {
"config": config,
"pixel_values": pixel_values,
"decoder_config": decoder_config,
"decoder_input_ids": decoder_input_ids,
"decoder_attention_mask": decoder_attention_mask,
"decoder_head_mask": decoder_head_mask,
"labels": decoder_input_ids,
}
def check_encoder_decoder_model_output_attentions(
self,
config,
decoder_config,
decoder_input_ids,
decoder_attention_mask,
pixel_values,
labels=None,
**kwargs,
):
# make the decoder inputs a different shape from the encoder inputs to harden the test
decoder_input_ids = decoder_input_ids[:, :-1]
decoder_attention_mask = decoder_attention_mask[:, :-1]
encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
enc_dec_model.to(torch_device)
outputs_encoder_decoder = enc_dec_model(
pixel_values=pixel_values,
decoder_input_ids=decoder_input_ids,
decoder_attention_mask=decoder_attention_mask,
output_attentions=True,
**kwargs,
)
encoder_attentions = outputs_encoder_decoder["encoder_attentions"]
self.assertEqual(len(encoder_attentions), config.num_hidden_layers)
seq_len = encoder_model.config.image_size // encoder_model.config.patch_size
decoder_attentions = outputs_encoder_decoder["decoder_attentions"]
num_decoder_layers = (
decoder_config.num_decoder_layers
if hasattr(decoder_config, "num_decoder_layers")
else decoder_config.num_hidden_layers
)
self.assertEqual(len(decoder_attentions), num_decoder_layers)
self.assertEqual(
decoder_attentions[0].shape[-3:],
(decoder_config.num_attention_heads, decoder_input_ids.shape[-1], decoder_input_ids.shape[-1]),
)
cross_attentions = outputs_encoder_decoder["cross_attentions"]
self.assertEqual(len(cross_attentions), num_decoder_layers)
cross_attention_input_seq_len = decoder_input_ids.shape[-1]
self.assertEqual(
cross_attentions[0].shape[-3:],
(decoder_config.num_attention_heads, cross_attention_input_seq_len, seq_len), # 4 6 16
)
def check_encoder_decoder_model_generate(self, config, decoder_config, pixel_values=None, **kwargs):
encoder_model, decoder_model = self.get_encoder_decoder_model(config, decoder_config)
enc_dec_model = VisionEncoderDecoderModel(encoder=encoder_model, decoder=decoder_model)
# Generate until max length
if hasattr(enc_dec_model.config, "eos_token_id"):
enc_dec_model.config.eos_token_id = None
if hasattr(enc_dec_model.config, "decoder") and hasattr(enc_dec_model.config.decoder, "eos_token_id"):
enc_dec_model.config.decoder.eos_token_id = None
if hasattr(enc_dec_model.generation_config, "eos_token_id"):
enc_dec_model.generation_config.eos_token_id = None
enc_dec_model.to(torch_device)
generated_output = enc_dec_model.generate(
pixel_values=pixel_values,
decoder_start_token_id=enc_dec_model.config.decoder.bos_token_id,
**kwargs,
)
self.assertEqual(generated_output.shape, (pixel_values.shape[0],) + (decoder_config.max_length,))
@unittest.skip(reason="Donut has an Integration test for that")
def test_real_model_save_load_from_pretrained(self):
pass
@require_vision
@require_torch
class TrOCRModelIntegrationTest(unittest.TestCase):