Tests: move generate tests to the right mixin and delete redundant tests (#34464)

* tmp commit
* tmp commit
* cull overwrites of deleted tests
* typo
* more specific docstring
* make fixup
* parameterize at the top?
* correction
* more deletions :D
* tmp commit
* for VLMs too
* fix _check_outputs
* test nit
* make fixup
* fix another flaky
* test_generate_from_inputs_embeds -- handle missing attention mask
2024-10-30 10:59:08 +00:00
parent 913330ca9f
commit 8a734ea2c3
46 changed files with 265 additions and 2348 deletions
--- a/tests/models/granitemoe/test_modeling_granitemoe.py
+++ b/tests/models/granitemoe/test_modeling_granitemoe.py
@@ -17,12 +17,10 @@
 import tempfile
 import unittest

-import pytest
 from parameterized import parameterized

 from transformers import AutoTokenizer, GraniteMoeConfig, is_torch_available, set_seed
 from transformers.testing_utils import (
-    require_bitsandbytes,
    require_flash_attn,
    require_read_token,
    require_torch,
@@ -302,9 +300,6 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
    # This is because we are hitting edge cases with the causal_mask buffer
    model_split_percents = [0.5, 0.7, 0.8]

-    # used in `test_torch_compile`
-    _torch_compile_test_ckpt = "ibm/PowerMoE-3b"
-
    def setUp(self):
        self.model_tester = GraniteMoeModelTester(self)
        self.config_tester = ConfigTester(self, config_class=GraniteMoeConfig, hidden_size=37)
@@ -422,46 +417,6 @@ class GraniteMoeModelTest(ModelTesterMixin, GenerationTesterMixin, unittest.Test
        with self.assertRaises(AssertionError):
            torch.testing.assert_close(yarn_sin_long, original_sin_long)

-    @require_flash_attn
-    @require_torch_gpu
-    @require_bitsandbytes
-    @pytest.mark.flash_attn_test
-    @require_read_token
-    @slow
-    def test_flash_attn_2_generate_padding_right(self):
-        """
-        Overwritting the common test as the test is flaky on tiny models
-        """
-        model = GraniteMoeForCausalLM.from_pretrained(
-            "ibm-granite/granitemoe-3b",
-            load_in_4bit=True,
-            device_map={"": 0},
-        )
-
-        tokenizer = AutoTokenizer.from_pretrained("ibm-granite/granitemoe-3b")
-
-        texts = ["hi", "Hello this is a very long sentence"]
-
-        tokenizer.padding_side = "right"
-        tokenizer.pad_token = tokenizer.eos_token
-
-        inputs = tokenizer(texts, return_tensors="pt", padding=True).to(0)
-
-        output_native = model.generate(**inputs, max_new_tokens=20, do_sample=False)
-        output_native = tokenizer.batch_decode(output_native)
-
-        model = GraniteMoeForCausalLM.from_pretrained(
-            "ibm-granite/granitemoe-3b",
-            load_in_4bit=True,
-            device_map={"": 0},
-            attn_implementation="flash_attention_2",
-        )
-
-        output_fa_2 = model.generate(**inputs, max_new_tokens=20, do_sample=False)
-        output_fa_2 = tokenizer.batch_decode(output_fa_2)
-
-        self.assertListEqual(output_native, output_fa_2)
-
    @require_flash_attn
    @require_torch_gpu
    @slow