Fix some fa2 tests (#35340)

* remove fa2 test

* remove other failing tests

* style
This commit is contained in:
Arthur
2024-12-19 17:05:25 +01:00
committed by GitHub
parent 667ed5635e
commit 1fa807fa63
4 changed files with 0 additions and 119 deletions

View File

@@ -14,14 +14,12 @@
# limitations under the License. # limitations under the License.
"""Testing suite for the PyTorch Granite model.""" """Testing suite for the PyTorch Granite model."""
import tempfile
import unittest import unittest
from parameterized import parameterized from parameterized import parameterized
from transformers import GraniteConfig, is_torch_available, set_seed from transformers import GraniteConfig, is_torch_available, set_seed
from transformers.testing_utils import ( from transformers.testing_utils import (
require_flash_attn,
require_read_token, require_read_token,
require_torch, require_torch,
require_torch_gpu, require_torch_gpu,
@@ -417,33 +415,6 @@ class GraniteModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMi
with self.assertRaises(AssertionError): with self.assertRaises(AssertionError):
torch.testing.assert_close(yarn_sin_long, original_sin_long) torch.testing.assert_close(yarn_sin_long, original_sin_long)
@require_flash_attn
@require_torch_gpu
@slow
def test_use_flash_attention_2_true(self):
"""
NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended.
"""
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
with tempfile.TemporaryDirectory() as tmp_dir:
model = model_class(config)
model.save_pretrained(tmp_dir)
new_model = GraniteForCausalLM.from_pretrained(
tmp_dir, use_flash_attention_2=True, torch_dtype=torch.float16
).to("cuda")
self.assertTrue(new_model.config._attn_implementation == "flash_attention_2")
has_flash = False
for name, submodule in new_model.named_modules():
if "FlashAttention" in submodule.__class__.__name__:
has_flash = True
break
if not has_flash:
raise ValueError("The flash model should have flash attention layers")
@require_torch_gpu @require_torch_gpu
class GraniteIntegrationTest(unittest.TestCase): class GraniteIntegrationTest(unittest.TestCase):

View File

@@ -14,14 +14,12 @@
# limitations under the License. # limitations under the License.
"""Testing suite for the PyTorch GraniteMoe model.""" """Testing suite for the PyTorch GraniteMoe model."""
import tempfile
import unittest import unittest
from parameterized import parameterized from parameterized import parameterized
from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed
from transformers.testing_utils import ( from transformers.testing_utils import (
require_flash_attn,
require_read_token, require_read_token,
require_torch, require_torch,
require_torch_gpu, require_torch_gpu,
@@ -416,33 +414,6 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
with self.assertRaises(AssertionError): with self.assertRaises(AssertionError):
torch.testing.assert_close(yarn_sin_long, original_sin_long) torch.testing.assert_close(yarn_sin_long, original_sin_long)
@require_flash_attn
@require_torch_gpu
@slow
def test_use_flash_attention_2_true(self):
"""
NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended.
"""
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
with tempfile.TemporaryDirectory() as tmp_dir:
model = model_class(config)
model.save_pretrained(tmp_dir)
new_model = GraniteMoeForCausalLM.from_pretrained(
tmp_dir, use_flash_attention_2=True, torch_dtype=torch.float16
).to("cuda")
self.assertTrue(new_model.config._attn_implementation == "flash_attention_2")
has_flash = False
for name, submodule in new_model.named_modules():
if "FlashAttention" in submodule.__class__.__name__:
has_flash = True
break
if not has_flash:
raise ValueError("The flash model should have flash attention layers")
@require_torch_gpu @require_torch_gpu
class GraniteMoeIntegrationTest(unittest.TestCase): class GraniteMoeIntegrationTest(unittest.TestCase):

View File

@@ -14,10 +14,8 @@
# limitations under the License. # limitations under the License.
"""Testing suite for the PyTorch LLaMA model.""" """Testing suite for the PyTorch LLaMA model."""
import tempfile
import unittest import unittest
import pytest
from packaging import version from packaging import version
from parameterized import parameterized from parameterized import parameterized
@@ -25,7 +23,6 @@ from transformers import AutoTokenizer, LlamaConfig, StaticCache, is_torch_avail
from transformers.generation.configuration_utils import GenerationConfig from transformers.generation.configuration_utils import GenerationConfig
from transformers.testing_utils import ( from transformers.testing_utils import (
cleanup, cleanup,
require_flash_attn,
require_read_token, require_read_token,
require_torch, require_torch,
require_torch_accelerator, require_torch_accelerator,
@@ -543,34 +540,6 @@ class LlamaModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixi
with self.assertRaises(KeyError): with self.assertRaises(KeyError):
config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear"}}) # missing "factor" config = _reinitialize_config(base_config, {"rope_scaling": {"rope_type": "linear"}}) # missing "factor"
@require_flash_attn
@require_torch_gpu
@slow
@pytest.mark.flash_attn_test
def test_use_flash_attention_2_true(self):
"""
NOTE: this is the only test testing that the legacy `use_flash_attention=2` argument still works as intended.
"""
config, inputs_dict = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
with tempfile.TemporaryDirectory() as tmp_dir:
model = model_class(config)
model.save_pretrained(tmp_dir)
new_model = LlamaForCausalLM.from_pretrained(
tmp_dir, use_flash_attention_2=True, torch_dtype=torch.float16
).to("cuda")
self.assertTrue(new_model.config._attn_implementation == "flash_attention_2")
has_flash = False
for name, submodule in new_model.named_modules():
if "FlashAttention" in submodule.__class__.__name__:
has_flash = True
break
if not has_flash:
raise ValueError("The flash model should have flash attention layers")
@require_torch_gpu @require_torch_gpu
class LlamaIntegrationTest(unittest.TestCase): class LlamaIntegrationTest(unittest.TestCase):

View File

@@ -2769,8 +2769,6 @@ class ModelTesterMixin:
attributes = tuple([f"{name}_{idx}" for idx in range(len(fx_outputs))]) attributes = tuple([f"{name}_{idx}" for idx in range(len(fx_outputs))])
for fx_output, pt_output, attr in zip(fx_outputs, pt_outputs, attributes): for fx_output, pt_output, attr in zip(fx_outputs, pt_outputs, attributes):
if isinstance(pt_output, DynamicCache):
pt_output = pt_output.to_legacy_cache()
self.check_pt_flax_outputs(fx_output, pt_output, model_class, tol=tol, name=attr) self.check_pt_flax_outputs(fx_output, pt_output, model_class, tol=tol, name=attr)
elif isinstance(fx_outputs, jnp.ndarray): elif isinstance(fx_outputs, jnp.ndarray):
@@ -3612,34 +3610,6 @@ class ModelTesterMixin:
num_params < 1000000 num_params < 1000000
), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max." ), f"{model_class} is too big for the common tests ({num_params})! It should have 1M max."
@require_flash_attn
@require_torch_gpu
@mark.flash_attn_test
@slow
def test_flash_attn_2_conversion(self):
if not self.has_attentions:
self.skipTest(reason="Model architecture does not support attentions")
config, _ = self.model_tester.prepare_config_and_inputs_for_common()
for model_class in self.all_model_classes:
if not model_class._supports_flash_attn_2:
self.skipTest(f"{model_class.__name__} does not support Flash Attention 2")
model = model_class(config)
with tempfile.TemporaryDirectory() as tmpdirname:
model.save_pretrained(tmpdirname)
model = model_class.from_pretrained(
tmpdirname, torch_dtype=torch.float16, attn_implementation="flash_attention_2"
).to(torch_device)
for _, module in model.named_modules():
if "FlashAttention" in module.__class__.__name__:
return
self.assertTrue(False, "FlashAttention2 modules not found in model")
@require_flash_attn @require_flash_attn
@require_torch_gpu @require_torch_gpu
@mark.flash_attn_test @mark.flash_attn_test