Attn implementation for composite models (#32238)
* first try * codestyle * idefics2 is happy * [run-slow] llava, llava_next, video_llava, vipllava, llava_next_video, idefics, idefics2, kosmos2, fuyu, blip, blip_2, instructblip, instructblipvideo, paligemma * fix-copies * [run-slow] llava, llava_next, video_llava, vipllava, llava_next_video, idefics, idefics2, kosmos2, fuyu, blip, blip_2, instructblip, instructblipvideo * blip-2 needs to init vision from config * when was this removed O_o * minor fix * tests * this way? * tests * model-agnostic code * codestyle * add tests for idefics * modify general test for VLMs * no generation test for vlm yet! * no generation test here also * wanr in VIT-SDPA if output attn * add more tests * user can pass dict as attn impl * repo consistency * update * muicgen * no prints * forgot speech enc-dec and clip * how many composite models we have? * musicgen meelody is same as mudicgen * +siglip * fix tests + add some more * remove idefics custom overriden code * make idefics2 automappable * nits * skip tests * doctests * Update src/transformers/models/idefics2/configuration_idefics2.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/clip/test_modeling_clip.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/idefics2/test_modeling_idefics2.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update tests/models/idefics2/test_modeling_idefics2.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * Update src/transformers/configuration_utils.py Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com> * major update, no need for automap * clean up * add FA2 test * more tests * style * skip tests * why did these started failing now? * no attributes for FA2 needed * one tiny test * address comment about FA2 false warning * style * add new models and resolve conflicts * fix copies * let it be this way for now, come back tomorrow to review * some more fixes * update * more updates * update * fix copies * style and tests * another big update * fix tests * fix tests * update * another update * fix tests * fix copies * fix tests --------- Co-authored-by: amyeroberts <22614925+amyeroberts@users.noreply.github.com>
This commit is contained in:
committed by
GitHub
parent
32590b5ecb
commit
21d5025826
@@ -654,8 +654,6 @@ class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
|
||||
model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype)
|
||||
model_sdpa = model_sdpa.eval().to(torch_device)
|
||||
|
||||
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
|
||||
|
||||
model_eager = model_class.from_pretrained(
|
||||
tmpdirname,
|
||||
torch_dtype=torch_dtype,
|
||||
@@ -663,20 +661,6 @@ class MusicgenDecoderTest(ModelTesterMixin, GenerationTesterMixin, PipelineTeste
|
||||
)
|
||||
model_eager = model_eager.eval().to(torch_device)
|
||||
|
||||
self.assertTrue(model_eager.config._attn_implementation == "eager")
|
||||
|
||||
for name, submodule in model_eager.named_modules():
|
||||
if "SdpaAttention" in submodule.__class__.__name__:
|
||||
raise ValueError("The eager model should not have SDPA attention layers")
|
||||
|
||||
has_sdpa = False
|
||||
for name, submodule in model_sdpa.named_modules():
|
||||
if "SdpaAttention" in submodule.__class__.__name__:
|
||||
has_sdpa = True
|
||||
break
|
||||
if not has_sdpa and model_sdpa.config.model_type != "falcon":
|
||||
raise ValueError("The SDPA model should have SDPA attention layers")
|
||||
|
||||
# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model,
|
||||
# but it would be nicer to have an efficient way to use parameterized.expand
|
||||
fail_cases = []
|
||||
@@ -1042,6 +1026,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
# not to test torchscript as the model tester doesn't prepare `input_values` and `padding_mask`
|
||||
# (and `torchscript` hates `None` values).
|
||||
test_torchscript = False
|
||||
_is_composite = True
|
||||
|
||||
def setUp(self):
|
||||
self.model_tester = MusicgenTester(self)
|
||||
@@ -1420,7 +1405,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
@require_torch_gpu
|
||||
@mark.flash_attn_test
|
||||
@slow
|
||||
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence
|
||||
# Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence
|
||||
def test_flash_attn_2_inference_equivalence(self):
|
||||
for model_class in self.all_model_classes:
|
||||
if not model_class._supports_flash_attn_2:
|
||||
@@ -1432,7 +1417,9 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
model_fa = model_class.from_pretrained(
|
||||
tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
|
||||
tmpdirname,
|
||||
torch_dtype=torch.bfloat16,
|
||||
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
|
||||
)
|
||||
model_fa.to(torch_device)
|
||||
|
||||
@@ -1505,7 +1492,88 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
@require_torch_gpu
|
||||
@mark.flash_attn_test
|
||||
@slow
|
||||
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence_right_padding
|
||||
def test_flash_attn_2_conversion(self):
|
||||
if not self.has_attentions:
|
||||
self.skipTest(reason="Model architecture does not support attentions")
|
||||
|
||||
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
if not model_class._supports_flash_attn_2:
|
||||
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
|
||||
|
||||
model = model_class(config)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
model = model_class.from_pretrained(
|
||||
tmpdirname,
|
||||
torch_dtype=torch.float16,
|
||||
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
|
||||
).to(torch_device)
|
||||
|
||||
for _, module in model.named_modules():
|
||||
if "FlashAttention" in module.__class__.__name__:
|
||||
return
|
||||
|
||||
self.assertTrue(False, "FlashAttention2 modules not found in model")
|
||||
|
||||
@require_torch_sdpa
|
||||
@require_torch_gpu
|
||||
@slow
|
||||
def test_sdpa_can_dispatch_on_flash(self):
|
||||
if not self.has_attentions:
|
||||
self.skipTest(reason="Model architecture does not support attentions")
|
||||
|
||||
torch.compiler.reset()
|
||||
compute_capability = torch.cuda.get_device_capability()
|
||||
major, _ = compute_capability
|
||||
|
||||
if not torch.version.cuda or major < 8:
|
||||
self.skipTest(reason="This test requires an NVIDIA GPU with compute capability >= 8.0")
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
if not model_class._supports_sdpa:
|
||||
self.skipTest(f"{model_class.__name__} does not support SDPA")
|
||||
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
inputs_dict = self._prepare_for_class(inputs_dict, model_class)
|
||||
if config.model_type in ["llava", "llava_next", "vipllava", "video_llava"]:
|
||||
self.skipTest(
|
||||
reason="Llava-like models currently (transformers==4.39.1) requires an attention_mask input"
|
||||
)
|
||||
if config.model_type in ["paligemma"]:
|
||||
self.skipTest(
|
||||
"PaliGemma-like models currently (transformers==4.41.0) requires an attention_mask input"
|
||||
)
|
||||
if config.model_type in ["idefics", "idefics2", "idefics3"]:
|
||||
self.skipTest(reason="Idefics currently (transformers==4.39.1) requires an image_attention_mask input")
|
||||
model = model_class(config)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
model = model_class.from_pretrained(
|
||||
tmpdirname,
|
||||
torch_dtype=torch.float16,
|
||||
attn_implementation={"decoder": "sdpa", "audio_encoder": None, "text_encoder": None},
|
||||
)
|
||||
model.to(torch_device)
|
||||
|
||||
inputs_dict.pop("attention_mask", None)
|
||||
inputs_dict.pop("decoder_attention_mask", None)
|
||||
|
||||
for name, inp in inputs_dict.items():
|
||||
if isinstance(inp, torch.Tensor) and inp.dtype in [torch.float32, torch.float16]:
|
||||
inputs_dict[name] = inp.to(torch.float16)
|
||||
|
||||
with torch.backends.cuda.sdp_kernel(enable_flash=True, enable_math=False, enable_mem_efficient=False):
|
||||
_ = model(**inputs_dict)
|
||||
|
||||
@require_flash_attn
|
||||
@require_torch_gpu
|
||||
@mark.flash_attn_test
|
||||
@slow
|
||||
# Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_inference_equivalence_right_padding
|
||||
def test_flash_attn_2_inference_equivalence_right_padding(self):
|
||||
for model_class in self.all_model_classes:
|
||||
if not model_class._supports_flash_attn_2:
|
||||
@@ -1517,7 +1585,9 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
model_fa = model_class.from_pretrained(
|
||||
tmpdirname, torch_dtype=torch.bfloat16, attn_implementation="flash_attention_2"
|
||||
tmpdirname,
|
||||
torch_dtype=torch.bfloat16,
|
||||
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
|
||||
)
|
||||
model_fa.to(torch_device)
|
||||
|
||||
@@ -1587,7 +1657,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
@require_torch_gpu
|
||||
@mark.flash_attn_test
|
||||
@slow
|
||||
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding
|
||||
# Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_left_padding
|
||||
def test_flash_attn_2_generate_left_padding(self):
|
||||
# Ignore copy
|
||||
for model_class in self.greedy_sample_model_classes:
|
||||
@@ -1622,7 +1692,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
model = model_class.from_pretrained(
|
||||
tmpdirname,
|
||||
torch_dtype=torch.float16,
|
||||
attn_implementation="flash_attention_2",
|
||||
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
|
||||
low_cpu_mem_usage=True,
|
||||
).to(torch_device)
|
||||
|
||||
@@ -1636,7 +1706,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
@require_torch_gpu
|
||||
@mark.flash_attn_test
|
||||
@slow
|
||||
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right
|
||||
# Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_padding_right
|
||||
def test_flash_attn_2_generate_padding_right(self):
|
||||
# Ignore copy
|
||||
for model_class in self.greedy_sample_model_classes:
|
||||
@@ -1670,7 +1740,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
model = model_class.from_pretrained(
|
||||
tmpdirname,
|
||||
torch_dtype=torch.float16,
|
||||
attn_implementation="flash_attention_2",
|
||||
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
|
||||
low_cpu_mem_usage=True,
|
||||
).to(torch_device)
|
||||
|
||||
@@ -1684,7 +1754,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
@require_torch_gpu
|
||||
@mark.flash_attn_test
|
||||
@slow
|
||||
# Copied from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache
|
||||
# Adapted from tests.test_modeling_common.ModelTesterMixin.test_flash_attn_2_generate_use_cache
|
||||
def test_flash_attn_2_generate_use_cache(self):
|
||||
max_new_tokens = 30
|
||||
|
||||
@@ -1713,7 +1783,7 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
model = model_class.from_pretrained(
|
||||
tmpdirname,
|
||||
torch_dtype=torch.float16,
|
||||
attn_implementation="flash_attention_2",
|
||||
attn_implementation={"decoder": "flash_attention_2", "audio_encoder": None, "text_encoder": None},
|
||||
low_cpu_mem_usage=True,
|
||||
).to(torch_device)
|
||||
|
||||
@@ -1726,6 +1796,53 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
use_cache=True,
|
||||
)
|
||||
|
||||
@require_torch_sdpa
|
||||
def test_sdpa_can_dispatch_composite_models(self):
|
||||
if not self.has_attentions:
|
||||
self.skipTest(reason="Model architecture does not support attentions")
|
||||
|
||||
if not self._is_composite:
|
||||
self.skipTest(f"{self.all_model_classes[0].__name__} does not support SDPA")
|
||||
|
||||
for model_class in self.all_model_classes:
|
||||
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
|
||||
model = model_class(config)
|
||||
|
||||
with tempfile.TemporaryDirectory() as tmpdirname:
|
||||
model.save_pretrained(tmpdirname)
|
||||
model_sdpa = model_class.from_pretrained(tmpdirname)
|
||||
model_sdpa = model_sdpa.eval().to(torch_device)
|
||||
|
||||
audio_encoder_attn = "sdpa" if model.audio_encoder._supports_sdpa else "eager"
|
||||
text_encoder_attn = "sdpa" if model.text_encoder._supports_sdpa else "eager"
|
||||
decoder_attn = "sdpa" if model.decoder._supports_sdpa else "eager"
|
||||
|
||||
# `None` as it is the requested one which will be assigned to each sub-config
|
||||
# Sub-model will dispatch to SDPA if it can (checked below that `SDPA` layers are present)
|
||||
self.assertTrue(model_sdpa.audio_encoder.config._attn_implementation == audio_encoder_attn)
|
||||
self.assertTrue(model_sdpa.text_encoder.config._attn_implementation == text_encoder_attn)
|
||||
self.assertTrue(model_sdpa.decoder.config._attn_implementation == decoder_attn)
|
||||
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
|
||||
model_eager = model_class.from_pretrained(tmpdirname, attn_implementation="eager")
|
||||
model_eager = model_eager.eval().to(torch_device)
|
||||
|
||||
self.assertTrue(model_eager.audio_encoder.config._attn_implementation == "eager")
|
||||
self.assertTrue(model_eager.text_encoder.config._attn_implementation == "eager")
|
||||
self.assertTrue(model_eager.decoder.config._attn_implementation == "eager")
|
||||
self.assertTrue(model_eager.config._attn_implementation == "eager")
|
||||
|
||||
for name, submodule in model_eager.named_modules():
|
||||
if "SdpaAttention" in submodule.__class__.__name__:
|
||||
raise ValueError("The eager model should not have SDPA attention layers")
|
||||
|
||||
has_sdpa = False
|
||||
for name, submodule in model_sdpa.named_modules():
|
||||
if "SdpaAttention" in submodule.__class__.__name__:
|
||||
has_sdpa = True
|
||||
break
|
||||
if not has_sdpa and model_sdpa.config.model_type != "falcon":
|
||||
raise ValueError("The SDPA model should have SDPA attention layers")
|
||||
|
||||
@parameterized.expand([("float16",), ("bfloat16",), ("float32",)])
|
||||
@require_torch_sdpa
|
||||
@slow
|
||||
@@ -1792,8 +1909,6 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
model_sdpa = model_class.from_pretrained(tmpdirname, torch_dtype=torch_dtype)
|
||||
model_sdpa = model_sdpa.eval().to(torch_device)
|
||||
|
||||
self.assertTrue(model_sdpa.config._attn_implementation == "sdpa")
|
||||
|
||||
model_eager = model_class.from_pretrained(
|
||||
tmpdirname,
|
||||
torch_dtype=torch_dtype,
|
||||
@@ -1801,20 +1916,6 @@ class MusicgenTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
||||
)
|
||||
model_eager = model_eager.eval().to(torch_device)
|
||||
|
||||
self.assertTrue(model_eager.config._attn_implementation == "eager")
|
||||
|
||||
for name, submodule in model_eager.named_modules():
|
||||
if "SdpaAttention" in submodule.__class__.__name__:
|
||||
raise ValueError("The eager model should not have SDPA attention layers")
|
||||
|
||||
has_sdpa = False
|
||||
for name, submodule in model_sdpa.named_modules():
|
||||
if "SdpaAttention" in submodule.__class__.__name__:
|
||||
has_sdpa = True
|
||||
break
|
||||
if not has_sdpa and model_sdpa.config.model_type != "falcon":
|
||||
raise ValueError("The SDPA model should have SDPA attention layers")
|
||||
|
||||
# We use these for loops instead of parameterized.expand just for the interest of avoiding loading/saving 8 times the model,
|
||||
# but it would be nicer to have an efficient way to use parameterized.expand
|
||||
fail_cases = []
|
||||
|
||||
Reference in New Issue
Block a user