Refactor CI: more explicit (#30674)

* don't run custom when not needed? * update test fetcher filtering * fixup and updates * update * update * reduce burden * nit * nit * missing comma * this? * this? * more parallelism * more * nit for real parallelism on tf and torch examples * update * update * update * update * update * update * update * update * update * update * update * update * update to make it more custom * update to make it more custom * update to make it more custom * update to make it more custom * update * update * update * update * update * update * use correct path * fix path to test files and examples * filter-tests * filter? * filter? * filter? * nits * fix naming of the artifacts to be pushed * list vs files * list vs files * fixup * fix list of all tests * fix the install steps * fix the install steps * fix the config * fix the config * only split if needed * only split if needed * extend should fix it * extend should fix it * arg * arg * update * update * run tests * run tests * run tests * more nits * update * update * update * update * update * update * update * simpler way to show the test, reduces the complexity of the generated config * simpler way to show the test, reduces the complexity of the generated config * style * oups * oups * fix import errors * skip some tests for now * update doctestjob * more parallelism * fixup * test only the test in examples * test only the test in examples * nits * from Arthur * fix generated config * update * update * show tests * oups * oups * fix torch job for now * use single upload step * oups * fu**k * fix * nit * update * nit * fix * fixes * [test-all] * add generate marker and generate job * oups * torch job runs not generate tests * let repo utils test all utils * update * styling * fix repo utils test * more parallel please * don't test * update * bit more verbose sir * more * hub were skipped * split by classname * revert * maybe? 
* Amazing catch Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com> * fix * update * update * maybe non capturing * manual convert? * pass artifacts as parameters as otherwise the config is too long * artifact.json * store output * might not be safe? * my token * mmm? * use CI job IS * can't get a proper id? * ups * build num * update * echo url * this? * this! * fix * wget * ish * dang * update * there we go * update * update * pass all * not .txt * update * fetch * fix naming * fix * up * update * update * ?? * update * more updates * update * more * skip * oups * pr documentation tests are currently created differently * update * hmmmm * oups * curl -L * update * ???? * nit * mmmm * ish * ouf * update * ish * update * update * update * nit * nit * up * oups * documentation_test fix * test hub tests everything, just marker * update * fix * test_hub is the only annoying one now * tf threads? * oups * not sure what is happening? * fix? * just use folder for stating hub * I am getting fucking annoyed * fix the test? * update * update * ? * fixes * add comment! * nit --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com> Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
2024-08-30 18:17:25 +02:00
parent 38d58a4427
commit b017a9eb11
10 changed files with 251 additions and 504 deletions
--- a/tests/generation/test_utils.py
+++ b/tests/generation/test_utils.py
@@ -21,6 +21,7 @@ import unittest
 import warnings

 import numpy as np
+import pytest
 from parameterized import parameterized

 from transformers import is_torch_available, pipeline, set_seed
@@ -88,6 +89,7 @@ if is_torch_available():
    from transformers.generation.utils import _speculative_sampling


+@pytest.mark.generate
 class GenerationTesterMixin:
    model_tester = None
    all_generative_model_classes = ()
@@ -417,6 +419,7 @@ class GenerationTesterMixin:

        return output_generate

+    @pytest.mark.generate
    def test_greedy_generate(self):
        for model_class in self.all_generative_model_classes:
            config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -429,6 +432,7 @@ class GenerationTesterMixin:
            else:
                self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1])

+    @pytest.mark.generate
    def test_greedy_generate_dict_outputs(self):
        for model_class in self.all_generative_model_classes:
            config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -459,6 +463,7 @@ class GenerationTesterMixin:

            self._check_outputs(output_generate, input_ids, model.config)

+    @pytest.mark.generate
    def test_greedy_generate_dict_outputs_use_cache(self):
        for model_class in self.all_generative_model_classes:
            config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -488,6 +493,7 @@ class GenerationTesterMixin:
                self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
            self._check_outputs(output_generate, input_ids, model.config, use_cache=True)

+    @pytest.mark.generate
    def test_sample_generate(self):
        for model_class in self.all_generative_model_classes:
            config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -505,6 +511,7 @@ class GenerationTesterMixin:
            else:
                self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1])

+    @pytest.mark.generate
    def test_sample_generate_dict_output(self):
        for model_class in self.all_generative_model_classes:
            config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -536,6 +543,7 @@ class GenerationTesterMixin:

            self._check_outputs(output_generate, input_ids, model.config, num_return_sequences=2)

+    @pytest.mark.generate
    def test_beam_search_generate(self):
        for model_class in self.all_generative_model_classes:
            config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -555,6 +563,7 @@ class GenerationTesterMixin:
            else:
                self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1])

+    @pytest.mark.generate
    def test_beam_search_generate_dict_output(self):
        for model_class in self.all_generative_model_classes:
            config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -588,6 +597,7 @@ class GenerationTesterMixin:
                output_generate, input_ids, model.config, num_return_sequences=beam_kwargs["num_beams"]
            )

+    @pytest.mark.generate
    def test_beam_search_generate_dict_outputs_use_cache(self):
        for model_class in self.all_generative_model_classes:
            # enable cache
@@ -626,6 +636,7 @@ class GenerationTesterMixin:

    @require_accelerate
    @require_torch_multi_accelerator
+    @pytest.mark.generate
    def test_model_parallel_beam_search(self):
        for model_class in self.all_generative_model_classes:
            if "xpu" in torch_device:
@@ -648,6 +659,7 @@ class GenerationTesterMixin:
                    num_beams=2,
                )

+    @pytest.mark.generate
    def test_beam_sample_generate(self):
        for model_class in self.all_generative_model_classes:
            config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -684,6 +696,7 @@ class GenerationTesterMixin:

                torch.testing.assert_close(output_generate[:, input_embeds.shape[1] :], output_generate2)

+    @pytest.mark.generate
    def test_beam_sample_generate_dict_output(self):
        for model_class in self.all_generative_model_classes:
            config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -719,6 +732,7 @@ class GenerationTesterMixin:
                output_generate, input_ids, model.config, num_return_sequences=beam_kwargs["num_beams"]
            )

+    @pytest.mark.generate
    def test_generate_without_input_ids(self):
        config, _, _ = self._get_input_ids_and_config()

@@ -739,6 +753,7 @@ class GenerationTesterMixin:
            )
            self.assertIsNotNone(output_ids_generate)

+    @pytest.mark.generate
    def test_group_beam_search_generate(self):
        for model_class in self.all_generative_model_classes:
            config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -771,6 +786,7 @@ class GenerationTesterMixin:
            else:
                self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1])

+    @pytest.mark.generate
    def test_group_beam_search_generate_dict_output(self):
        for model_class in self.all_generative_model_classes:
            config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -806,6 +822,7 @@ class GenerationTesterMixin:

    # TODO: @gante
    @is_flaky()
+    @pytest.mark.generate
    def test_constrained_beam_search_generate(self):
        for model_class in self.all_generative_model_classes:
            config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -863,6 +880,7 @@ class GenerationTesterMixin:
            for generation_output in output_generate:
                self._check_sequence_inside_sequence(force_tokens, generation_output)

+    @pytest.mark.generate
    def test_constrained_beam_search_generate_dict_output(self):
        for model_class in self.all_generative_model_classes:
            config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -907,6 +925,7 @@ class GenerationTesterMixin:
                output_generate, input_ids, model.config, num_return_sequences=beam_kwargs["num_beams"]
            )

+    @pytest.mark.generate
    def test_contrastive_generate(self):
        for model_class in self.all_generative_model_classes:
            if model_class._is_stateful:
@@ -933,6 +952,7 @@ class GenerationTesterMixin:
            else:
                self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1])

+    @pytest.mark.generate
    def test_contrastive_generate_dict_outputs_use_cache(self):
        for model_class in self.all_generative_model_classes:
            if model_class._is_stateful:
@@ -968,6 +988,7 @@ class GenerationTesterMixin:
                self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
            self._check_outputs(output_generate, input_ids, model.config, use_cache=True)

+    @pytest.mark.generate
    def test_contrastive_generate_low_memory(self):
        # Check that choosing 'low_memory' does not change the model output
        for model_class in self.all_generative_model_classes:
@@ -1011,6 +1032,7 @@ class GenerationTesterMixin:
            )
            self.assertListEqual(low_output.tolist(), high_output.tolist())

+    @pytest.mark.generate
    def test_beam_search_low_memory(self):
        # Check that choosing 'low_memory' does not change the model output
        for model_class in self.all_generative_model_classes:
@@ -1053,6 +1075,7 @@ class GenerationTesterMixin:
            )
            self.assertListEqual(low_output.tolist(), high_output.tolist())

+    @pytest.mark.generate
    @parameterized.expand([("random",), ("same",)])
    @is_flaky()  # Read NOTE (1) below. If there are API issues, all attempts will fail.
    def test_assisted_decoding_matches_greedy_search(self, assistant_type):
@@ -1134,6 +1157,7 @@ class GenerationTesterMixin:
                self._check_outputs(output, input_ids, model.config, use_cache=True)

    @is_flaky()
+    @pytest.mark.generate
    def test_prompt_lookup_decoding_matches_greedy_search(self):
        # This test ensures that the prompt lookup generation does not introduce output changes over greedy search.
        # This test is mostly a copy of test_assisted_decoding_matches_greedy_search
@@ -1196,6 +1220,7 @@ class GenerationTesterMixin:
            for output in (output_greedy, output_prompt_lookup):
                self._check_outputs(output, input_ids, model.config, use_cache=True)

+    @pytest.mark.generate
    def test_dola_decoding_sample(self):
        # TODO (joao): investigate skips, try to reduce incompatibilities
        for model_class in self.all_generative_model_classes:
@@ -1240,6 +1265,7 @@ class GenerationTesterMixin:
            output_dola = model.generate(input_ids, **model_kwargs, **generation_kwargs)
            self._check_outputs(output_dola, input_ids, model.config, use_cache=hasattr(config, "use_cache"))

+    @pytest.mark.generate
    def test_assisted_decoding_sample(self):
        # In this test we don't check assisted vs non-assisted output -- seeded assisted decoding with sample will not
        # match sample for the same seed, as the forward pass does not return the exact same logits (due to matmul with
@@ -1299,6 +1325,7 @@ class GenerationTesterMixin:

            self._check_outputs(output_assisted, input_ids, model.config, use_cache=True)

+    @pytest.mark.generate
    def test_prompt_lookup_decoding_stops_at_eos(self):
        # This test ensures that the prompt lookup generation stops at eos token and does not suggest more tokens
        # (see https://github.com/huggingface/transformers/pull/31301)
@@ -1327,6 +1354,7 @@ class GenerationTesterMixin:
        # PLD shouldn't propose any new tokens based on eos-match
        self.assertTrue(output_prompt_lookup.shape[-1] == 10)

+    @pytest.mark.generate
    def test_generate_with_head_masking(self):
        """Test designed for encoder-decoder models to ensure the attention head masking is used."""
        attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
@@ -1366,6 +1394,7 @@ class GenerationTesterMixin:
                attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
                self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)

+    @pytest.mark.generate
    def test_left_padding_compatibility(self):
        # NOTE: left-padding results in small numerical differences. This is expected.
        # See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535
@@ -1434,6 +1463,7 @@ class GenerationTesterMixin:
            # They should result in very similar logits
            self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5))

+    @pytest.mark.generate
    def test_past_key_values_format(self):
        # Test that the KV cache is formatted correctly. Exceptions need to explicitly overwrite this test. Having a
        # standard KV cache format is important for a consistent API (and for advanced generation methods).
@@ -1505,6 +1535,7 @@ class GenerationTesterMixin:
                        past_kv[i][1].shape, (batch_size, num_attention_heads, seq_length, per_head_embed_dim)
                    )

+    @pytest.mark.generate
    def test_generate_from_inputs_embeds_decoder_only(self):
        # When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids`
        # if fails, you should probably update the `prepare_inputs_for_generation` function
@@ -1555,6 +1586,7 @@ class GenerationTesterMixin:
                outputs_from_embeds_wo_ids.tolist(),
            )

+    @pytest.mark.generate
    def test_generate_continue_from_past_key_values(self):
        # Tests that we can continue generating from past key values, returned from a previous `generate` call
        for model_class in self.all_generative_model_classes:
@@ -1638,6 +1670,7 @@ class GenerationTesterMixin:
                    )

    @parameterized.expand([(1, False), (1, True), (4, False)])
+    @pytest.mark.generate
    def test_new_cache_format(self, num_beams, do_sample):
        # Tests that generating with the new format is exactly the same as the legacy one (for models that support it).
        # 👉 tests with and without beam search so that we can test with and without cache reordering.
@@ -1702,6 +1735,7 @@ class GenerationTesterMixin:
                        )
                    )

+    @pytest.mark.generate
    def test_generate_with_static_cache(self):
        """
        Tests if StaticCache works if we set attn_implementation=static when generation.
@@ -1750,6 +1784,7 @@ class GenerationTesterMixin:
            self.assertTrue(results.past_key_values.key_cache[0].shape == cache_shape)

    @require_quanto
+    @pytest.mark.generate
    def test_generate_with_quant_cache(self):
        for model_class in self.all_generative_model_classes:
            if not model_class._supports_quantized_cache:
@@ -1782,6 +1817,7 @@ class GenerationTesterMixin:
            with self.assertRaises(ValueError):
                model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)

+    @pytest.mark.generate
    @require_torch_gpu
    @slow
    @is_flaky()  # compilation may result in equivalent (!= same) FP ops, causing the argmax in `generate` to be flaky
@@ -2134,6 +2170,7 @@ class UtilsFunctionsTest(unittest.TestCase):
        self.assertTrue(validated_tokens.tolist()[0] == [1, 4, 8])


+@pytest.mark.generate
@require_torch
 class GenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMixin):
    # setting framework_dependent_parameters needs to be gated, just like its contents' imports