Refactor CI: more explicit (#30674)

* don't run custom when not needed?

* update test fetcher filtering

* fixup and updates

* update

* update

* reduce burden

* nit

* nit

* missing comma

* this?

* this?

* more parallelism

* more

* nit for real parallelism on tf and torch examples

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update

* update to make it more custom

* update to make it more custom

* update to make it more custom

* update to make it more custom

* update

* update

* update

* update

* update

* update

* use correct path

* fix path to test files and examples

* filter-tests

* filter?

* filter?

* filter?

* nits

* fix naming of the artifacts to be pushed

* list vs files

* list vs files

* fixup

* fix list of all tests

* fix the install steps

* fix the install steps

* fix the config

* fix the config

* only split if needed

* only split if needed

* extend should fix it

* extend should fix it

* arg

* arg

* update

* update

* run tests

* run tests

* run tests

* more nits

* update

* update

* update

* update

* update

* update

* update

* simpler way to show the test, reduces the complexity of the generated config

* simpler way to show the test, reduces the complexity of the generated config

* style

* oups

* oups

* fix import errors

* skip some tests for now

* update doctestjob

* more parallelism

* fixup

* test only the test in examples

* test only the test in examples

* nits

* from Arthur

* fix generated config

* update

* update

* show tests

* oups

* oups

* fix torch job for now

* use single upload step

* oups

* fu**k

* fix

* nit

* update

* nit

* fix

* fixes

* [test-all]

* add generate marker and generate job

* oups

* torch job runs the `not generate` tests

* let repo utils test all utils

* update

* styling

* fix repo utils test

* more parallel please

* don't test

* update

* bit more verbose sir

* more

* hub tests were skipped

* split by classname

* revert

* maybe?

* Amazing catch

Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>

* fix

* update

* update

* maybe non capturing

* manual convert?

* pass artifacts as parameters as otherwise the config is too long

* artifact.json

* store output

* might not be safe?

* my token

* mmm?

* use CI job ID

* can't get a proper id?

* ups

* build num

* update

* echo url

* this?

* this!

* fix

* wget

* ish

* dang

* update

* there we go

* update

* update

* pass all

* not .txt

* update

* fetch

* fix naming

* fix

* up

* update

* update

* ??

* update

* more updates

* update

* more

* skip

* oups

* pr documentation tests are currently created differently

* update

* hmmmm

* oups

* curl -L

* update

* ????

* nit

* mmmm

* ish

* ouf

* update

* ish

* update

* update

* update

* nit

* nit

* up

* oups

* documentation_test fix

* test hub tests everything, just marker

* update

* fix

* test_hub is the only annoying one now

* tf threads?

* oups

* not sure what is happening?

* fix?

* just use folder for staging hub

* I am getting fucking annoyed

* fix the test?

* update

* update

* ?

* fixes

* add comment!

* nit

---------

Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
Co-authored-by: Yih-Dar <2521628+ydshieh@users.noreply.github.com>
This commit is contained in:
Arthur
2024-08-30 18:17:25 +02:00
committed by GitHub
parent 38d58a4427
commit b017a9eb11
10 changed files with 251 additions and 504 deletions

View File

@@ -21,6 +21,7 @@ import unittest
import warnings
import numpy as np
import pytest
from parameterized import parameterized
from transformers import is_torch_available, pipeline, set_seed
@@ -88,6 +89,7 @@ if is_torch_available():
from transformers.generation.utils import _speculative_sampling
@pytest.mark.generate
class GenerationTesterMixin:
model_tester = None
all_generative_model_classes = ()
@@ -417,6 +419,7 @@ class GenerationTesterMixin:
return output_generate
@pytest.mark.generate
def test_greedy_generate(self):
for model_class in self.all_generative_model_classes:
config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -429,6 +432,7 @@ class GenerationTesterMixin:
else:
self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
@pytest.mark.generate
def test_greedy_generate_dict_outputs(self):
for model_class in self.all_generative_model_classes:
config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -459,6 +463,7 @@ class GenerationTesterMixin:
self._check_outputs(output_generate, input_ids, model.config)
@pytest.mark.generate
def test_greedy_generate_dict_outputs_use_cache(self):
for model_class in self.all_generative_model_classes:
config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -488,6 +493,7 @@ class GenerationTesterMixin:
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
self._check_outputs(output_generate, input_ids, model.config, use_cache=True)
@pytest.mark.generate
def test_sample_generate(self):
for model_class in self.all_generative_model_classes:
config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -505,6 +511,7 @@ class GenerationTesterMixin:
else:
self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
@pytest.mark.generate
def test_sample_generate_dict_output(self):
for model_class in self.all_generative_model_classes:
config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -536,6 +543,7 @@ class GenerationTesterMixin:
self._check_outputs(output_generate, input_ids, model.config, num_return_sequences=2)
@pytest.mark.generate
def test_beam_search_generate(self):
for model_class in self.all_generative_model_classes:
config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -555,6 +563,7 @@ class GenerationTesterMixin:
else:
self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
@pytest.mark.generate
def test_beam_search_generate_dict_output(self):
for model_class in self.all_generative_model_classes:
config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -588,6 +597,7 @@ class GenerationTesterMixin:
output_generate, input_ids, model.config, num_return_sequences=beam_kwargs["num_beams"]
)
@pytest.mark.generate
def test_beam_search_generate_dict_outputs_use_cache(self):
for model_class in self.all_generative_model_classes:
# enable cache
@@ -626,6 +636,7 @@ class GenerationTesterMixin:
@require_accelerate
@require_torch_multi_accelerator
@pytest.mark.generate
def test_model_parallel_beam_search(self):
for model_class in self.all_generative_model_classes:
if "xpu" in torch_device:
@@ -648,6 +659,7 @@ class GenerationTesterMixin:
num_beams=2,
)
@pytest.mark.generate
def test_beam_sample_generate(self):
for model_class in self.all_generative_model_classes:
config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -684,6 +696,7 @@ class GenerationTesterMixin:
torch.testing.assert_close(output_generate[:, input_embeds.shape[1] :], output_generate2)
@pytest.mark.generate
def test_beam_sample_generate_dict_output(self):
for model_class in self.all_generative_model_classes:
config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -719,6 +732,7 @@ class GenerationTesterMixin:
output_generate, input_ids, model.config, num_return_sequences=beam_kwargs["num_beams"]
)
@pytest.mark.generate
def test_generate_without_input_ids(self):
config, _, _ = self._get_input_ids_and_config()
@@ -739,6 +753,7 @@ class GenerationTesterMixin:
)
self.assertIsNotNone(output_ids_generate)
@pytest.mark.generate
def test_group_beam_search_generate(self):
for model_class in self.all_generative_model_classes:
config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -771,6 +786,7 @@ class GenerationTesterMixin:
else:
self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
@pytest.mark.generate
def test_group_beam_search_generate_dict_output(self):
for model_class in self.all_generative_model_classes:
config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -806,6 +822,7 @@ class GenerationTesterMixin:
# TODO: @gante
@is_flaky()
@pytest.mark.generate
def test_constrained_beam_search_generate(self):
for model_class in self.all_generative_model_classes:
config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -863,6 +880,7 @@ class GenerationTesterMixin:
for generation_output in output_generate:
self._check_sequence_inside_sequence(force_tokens, generation_output)
@pytest.mark.generate
def test_constrained_beam_search_generate_dict_output(self):
for model_class in self.all_generative_model_classes:
config, input_ids, attention_mask = self._get_input_ids_and_config()
@@ -907,6 +925,7 @@ class GenerationTesterMixin:
output_generate, input_ids, model.config, num_return_sequences=beam_kwargs["num_beams"]
)
@pytest.mark.generate
def test_contrastive_generate(self):
for model_class in self.all_generative_model_classes:
if model_class._is_stateful:
@@ -933,6 +952,7 @@ class GenerationTesterMixin:
else:
self.assertTrue(output_generate.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
@pytest.mark.generate
def test_contrastive_generate_dict_outputs_use_cache(self):
for model_class in self.all_generative_model_classes:
if model_class._is_stateful:
@@ -968,6 +988,7 @@ class GenerationTesterMixin:
self.assertTrue(output_generate.sequences.shape[-1] == self.max_new_tokens + input_ids.shape[-1])
self._check_outputs(output_generate, input_ids, model.config, use_cache=True)
@pytest.mark.generate
def test_contrastive_generate_low_memory(self):
# Check that choosing 'low_memory' does not change the model output
for model_class in self.all_generative_model_classes:
@@ -1011,6 +1032,7 @@ class GenerationTesterMixin:
)
self.assertListEqual(low_output.tolist(), high_output.tolist())
@pytest.mark.generate
def test_beam_search_low_memory(self):
# Check that choosing 'low_memory' does not change the model output
for model_class in self.all_generative_model_classes:
@@ -1053,6 +1075,7 @@ class GenerationTesterMixin:
)
self.assertListEqual(low_output.tolist(), high_output.tolist())
@pytest.mark.generate
@parameterized.expand([("random",), ("same",)])
@is_flaky() # Read NOTE (1) below. If there are API issues, all attempts will fail.
def test_assisted_decoding_matches_greedy_search(self, assistant_type):
@@ -1134,6 +1157,7 @@ class GenerationTesterMixin:
self._check_outputs(output, input_ids, model.config, use_cache=True)
@is_flaky()
@pytest.mark.generate
def test_prompt_lookup_decoding_matches_greedy_search(self):
# This test ensures that the prompt lookup generation does not introduce output changes over greedy search.
# This test is mostly a copy of test_assisted_decoding_matches_greedy_search
@@ -1196,6 +1220,7 @@ class GenerationTesterMixin:
for output in (output_greedy, output_prompt_lookup):
self._check_outputs(output, input_ids, model.config, use_cache=True)
@pytest.mark.generate
def test_dola_decoding_sample(self):
# TODO (joao): investigate skips, try to reduce incompatibilities
for model_class in self.all_generative_model_classes:
@@ -1240,6 +1265,7 @@ class GenerationTesterMixin:
output_dola = model.generate(input_ids, **model_kwargs, **generation_kwargs)
self._check_outputs(output_dola, input_ids, model.config, use_cache=hasattr(config, "use_cache"))
@pytest.mark.generate
def test_assisted_decoding_sample(self):
# In this test we don't check assisted vs non-assisted output -- seeded assisted decoding with sample will not
# match sample for the same seed, as the forward pass does not return the exact same logits (due to matmul with
@@ -1299,6 +1325,7 @@ class GenerationTesterMixin:
self._check_outputs(output_assisted, input_ids, model.config, use_cache=True)
@pytest.mark.generate
def test_prompt_lookup_decoding_stops_at_eos(self):
# This test ensures that the prompt lookup generation stops at eos token and does not suggest more tokens
# (see https://github.com/huggingface/transformers/pull/31301)
@@ -1327,6 +1354,7 @@ class GenerationTesterMixin:
# PLD shouldn't propose any new tokens based on eos-match
self.assertTrue(output_prompt_lookup.shape[-1] == 10)
@pytest.mark.generate
def test_generate_with_head_masking(self):
"""Test designed for encoder-decoder models to ensure the attention head masking is used."""
attention_names = ["encoder_attentions", "decoder_attentions", "cross_attentions"]
@@ -1366,6 +1394,7 @@ class GenerationTesterMixin:
attn_weights = out[attn_name] if attn_name == attention_names[0] else out[attn_name][-1]
self.assertEqual(sum([w.sum().item() for w in attn_weights]), 0.0)
@pytest.mark.generate
def test_left_padding_compatibility(self):
# NOTE: left-padding results in small numerical differences. This is expected.
# See https://github.com/huggingface/transformers/issues/25420#issuecomment-1775317535
@@ -1434,6 +1463,7 @@ class GenerationTesterMixin:
# They should result in very similar logits
self.assertTrue(torch.allclose(next_logits_wo_padding, next_logits_with_padding, atol=1e-5))
@pytest.mark.generate
def test_past_key_values_format(self):
# Test that the KV cache is formatted correctly. Exceptions need to explicitly overwrite this test. Having a
# standard KV cache format is important for a consistent API (and for advanced generation methods).
@@ -1505,6 +1535,7 @@ class GenerationTesterMixin:
past_kv[i][1].shape, (batch_size, num_attention_heads, seq_length, per_head_embed_dim)
)
@pytest.mark.generate
def test_generate_from_inputs_embeds_decoder_only(self):
# When supported, tests that the decoder model can generate from `inputs_embeds` instead of `input_ids`
# if fails, you should probably update the `prepare_inputs_for_generation` function
@@ -1555,6 +1586,7 @@ class GenerationTesterMixin:
outputs_from_embeds_wo_ids.tolist(),
)
@pytest.mark.generate
def test_generate_continue_from_past_key_values(self):
# Tests that we can continue generating from past key values, returned from a previous `generate` call
for model_class in self.all_generative_model_classes:
@@ -1638,6 +1670,7 @@ class GenerationTesterMixin:
)
@parameterized.expand([(1, False), (1, True), (4, False)])
@pytest.mark.generate
def test_new_cache_format(self, num_beams, do_sample):
# Tests that generating with the new format is exactly the same as the legacy one (for models that support it).
# 👉 tests with and without beam search so that we can test with and without cache reordering.
@@ -1702,6 +1735,7 @@ class GenerationTesterMixin:
)
)
@pytest.mark.generate
def test_generate_with_static_cache(self):
"""
Tests if StaticCache works if we set attn_implementation=static when generation.
@@ -1750,6 +1784,7 @@ class GenerationTesterMixin:
self.assertTrue(results.past_key_values.key_cache[0].shape == cache_shape)
@require_quanto
@pytest.mark.generate
def test_generate_with_quant_cache(self):
for model_class in self.all_generative_model_classes:
if not model_class._supports_quantized_cache:
@@ -1782,6 +1817,7 @@ class GenerationTesterMixin:
with self.assertRaises(ValueError):
model.generate(input_ids, attention_mask=attention_mask, **generation_kwargs)
@pytest.mark.generate
@require_torch_gpu
@slow
@is_flaky() # compilation may result in equivalent (!= same) FP ops, causing the argmax in `generate` to be flaky
@@ -2134,6 +2170,7 @@ class UtilsFunctionsTest(unittest.TestCase):
self.assertTrue(validated_tokens.tolist()[0] == [1, 4, 8])
@pytest.mark.generate
@require_torch
class GenerationIntegrationTests(unittest.TestCase, GenerationIntegrationTestsMixin):
# setting framework_dependent_parameters needs to be gated, just like its contents' imports