From 1fa807fa63d1aa9409fb1ae0cbb7583960e5ea98 Mon Sep 17 00:00:00 2001 From: Arthur <48595927+ArthurZucker@users.noreply.github.com> Date: Thu, 19 Dec 2024 17:05:25 +0100 Subject: [PATCH] Fix some fa2 tests (#35340) * remove fa2 test * remove other failing tests * style --- tests/models/granite/test_modeling_granite.py | 29 ----------------- .../granitemoe/test_modeling_granitemoe.py | 29 ----------------- tests/models/llama/test_modeling_llama.py | 31 ------------------- tests/test_modeling_common.py | 30 ------------------ 4 files changed, 119 deletions(-) diff --git a/tests/models/granite/test_modeling_granite.py b/tests/models/granite/test_modeling_granite.py index 60eb964927..686544825c 100644 --- a/tests/models/granite/test_modeling_granite.py +++ b/tests/models/granite/test_modeling_granite.py @@ -14,14 +14,12 @@ # limitations under the License. """Testing suite for the PyTorch Granite model.""" -import tempfile import unittest from parameterized import parameterized from transformers import GraniteConfig, is_torch_available, set_seed from transformers.testing_utils import ( - require_flash_attn, require_read_token, require_torch, require_torch_gpu, @@ -417,33 +415,6 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi with self.assertRaises(AssertionError): torch.testing.assert_close(yarn_sin_long, original_sin_long) - @require_flash_attn - @require_torch_gpu - @slow - def test_use_flash_attention_2_true(self): - """ - NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended. - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - with tempfile.TemporaryDirectory() as tmp_dir: - model = model_class(config) - model.save_pretrained(tmp_dir) - - new_model = GraniteForCausalLM.from_pretrained( - tmp_dir, use_flash_attention_2=True, torch_dtype=torch.float16 - ).to("cuda") - - self.assertTrue(new_model.config._attn_implementation == "flash_attention_2") - - has_flash = False - for name, submodule in new_model.named_modules(): - if "FlashAttention" in submodule.__class__.__name__: - has_flash = True - break - if not has_flash: - raise ValueError("The flash model should have flash attention layers") - @require_torch_gpu class GraniteIntegrationTest(unittest.TestCase): diff --git a/tests/models/granitemoe/test_modeling_granitemoe.py b/tests/models/granitemoe/test_modeling_granitemoe.py index 97af65667e..31307865a7 100644 --- a/tests/models/granitemoe/test_modeling_granitemoe.py +++ b/tests/models/granitemoe/test_modeling_granitemoe.py @@ -14,14 +14,12 @@ # limitations under the License. """Testing suite for the PyTorch GraniteMoe model.""" -import tempfile import unittest from parameterized import parameterized from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed from transformers.testing_utils import ( - require_flash_attn, require_read_token, require_torch, require_torch_gpu, @@ -416,33 +414,6 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test with self.assertRaises(AssertionError): torch.testing.assert_close(yarn_sin_long, original_sin_long) - @require_flash_attn - @require_torch_gpu - @slow - def test_use_flash_attention_2_true(self): - """ - NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended. - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - with tempfile.TemporaryDirectory() as tmp_dir: - model = model_class(config) - model.save_pretrained(tmp_dir) - - new_model = GraniteMoeForCausalLM.from_pretrained( - tmp_dir, use_flash_attention_2=True, torch_dtype=torch.float16 - ).to("cuda") - - self.assertTrue(new_model.config._attn_implementation == "flash_attention_2") - - has_flash = False - for name, submodule in new_model.named_modules(): - if "FlashAttention" in submodule.__class__.__name__: - has_flash = True - break - if not has_flash: - raise ValueError("The flash model should have flash attention layers") - @require_torch_gpu class GraniteMoeIntegrationTest(unittest.TestCase): diff --git a/tests/models/llama/test_modeling_llama.py b/tests/models/llama/test_modeling_llama.py index 78e42e6ba7..feca640bb4 100644 --- a/tests/models/llama/test_modeling_llama.py +++ b/tests/models/llama/test_modeling_llama.py @@ -14,10 +14,8 @@ # limitations under the License. """Testing suite for the PyTorch LLaMA model.""" -import tempfile import unittest -import pytest from packaging import version from parameterized import parameterized @@ -25,7 +23,6 @@ from transformers import AutoTokenizer, LlamaConfig, StaticCache, is_torch_avail from transformers.generation.configuration_utils import GenerationConfig from transformers.testing_utils import ( cleanup, - require_flash_attn, require_read_token, require_torch, require_torch_accelerator, @@ -543,34 +540,6 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi with self.assertRaises(KeyError): config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear"}}) # missing "factor" - @require_flash_attn - @require_torch_gpu - @slow - @pytest.mark.flash_attn_test - def test_use_flash_attention_2_true(self): - """ - NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended. - """ - config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common() - for model_class in self.all_model_classes: - with tempfile.TemporaryDirectory() as tmp_dir: - model = model_class(config) - model.save_pretrained(tmp_dir) - - new_model = LlamaForCausalLM.from_pretrained( - tmp_dir, use_flash_attention_2=True, torch_dtype=torch.float16 - ).to("cuda") - - self.assertTrue(new_model.config._attn_implementation == "flash_attention_2") - - has_flash = False - for name, submodule in new_model.named_modules(): - if "FlashAttention" in submodule.__class__.__name__: - has_flash = True - break - if not has_flash: - raise ValueError("The flash model should have flash attention layers") - @require_torch_gpu class LlamaIntegrationTest(unittest.TestCase): diff --git a/tests/test_modeling_common.py b/tests/test_modeling_common.py index 5f053c20ff..f150477c62 100755 --- a/tests/test_modeling_common.py +++ b/tests/test_modeling_common.py @@ -2769,8 +2769,6 @@ class ModelTesterMixin: attributes = tuple([f"{name}_{idx}" for idx in range(len(fx_outputs))]) for fx_output, pt_output, attr in zip(fx_outputs, pt_outputs, attributes): - if isinstance(pt_output, DynamicCache): - pt_output = pt_output.to_legacy_cache() self.check_pt_flax_outputs(fx_output, pt_output, model_class, tol=tol, name=attr) elif isinstance(fx_outputs, jnp.ndarray): @@ -3612,34 +3610,6 @@ class ModelTesterMixin: num_params < 1000000 ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max." - @require_flash_attn - @require_torch_gpu - @mark.flash_attn_test - @slow - def test_flash_attn_2_conversion(self): - if not self.has_attentions: - self.skipTest(reason="Model architecture does not support attentions") - - config, _ = self.model_tester.prepare_config_and_inputs_for_common() - - for model_class in self.all_model_classes: - if not model_class._supports_flash_attn_2: - self.skipTest(f"{model_class.__name__} does not support Flash Attention 2") - - model = model_class(config) - - with tempfile.TemporaryDirectory() as tmpdirname: - model.save_pretrained(tmpdirname) - model = model_class.from_pretrained( - tmpdirname, torch_dtype=torch.float16, attn_implementation="flash_attention_2" - ).to(torch_device) - - for _, module in model.named_modules(): - if "FlashAttention" in module.__class__.__name__: - return - - self.assertTrue(False, "FlashAttention2 modules not found in model") - @require_flash_attn @require_torch_gpu @mark.flash_attn_test