Skip tests properly (#31308)

* Skip tests properly

* [test_all]

* Add 'reason' as kwarg for skipTest

* [test_all] Fix up

* [test_all]
This commit is contained in:
amyeroberts
2024-06-26 21:59:08 +01:00
committed by GitHub
parent 1f9f57ab4c
commit 1de7dc7403
254 changed files with 1721 additions and 1298 deletions

View File

@@ -417,7 +417,7 @@ class TokenizerTesterMixin:
def test_sentencepiece_tokenize_and_convert_tokens_to_string(self):
"""Test ``_tokenize`` and ``convert_tokens_to_string``."""
if not self.test_sentencepiece:
return
self.skipTest(reason="test_sentencepiece is set to False")
tokenizer = self.get_tokenizer()
text = "This is text to test the tokenizer."
@@ -449,7 +449,7 @@ class TokenizerTesterMixin:
def test_sentencepiece_tokenize_and_decode(self):
if not self.test_sentencepiece:
return
self.skipTest(reason="test_sentencepiece is set to False")
text = "This is text to test the tokenizer."
if self.test_rust_tokenizer:
@@ -466,7 +466,7 @@ class TokenizerTesterMixin:
def test_subword_regularization_tokenizer(self) -> None:
if not self.test_sentencepiece:
return
self.skipTest(reason="test_sentencepiece is set to False")
# Subword regularization is only available for the slow tokenizer.
sp_model_kwargs = {"enable_sampling": True, "alpha": 0.1, "nbest_size": -1}
@@ -484,7 +484,7 @@ class TokenizerTesterMixin:
def test_pickle_subword_regularization_tokenizer(self) -> None:
if not self.test_sentencepiece:
return
self.skipTest(reason="test_sentencepiece is set to False")
"""Google pickle __getstate__ __setstate__ if you are struggling with this."""
# Subword regularization is only available for the slow tokenizer.
@@ -506,7 +506,7 @@ class TokenizerTesterMixin:
def test_save_sentencepiece_tokenizer(self) -> None:
if not self.test_sentencepiece or not self.test_slow_tokenizer:
return
self.skipTest(reason="test_sentencepiece or test_slow_tokenizer is set to False")
# We want to verify that we will be able to save the tokenizer even if the original files that were used to
# build the tokenizer have been deleted in the meantime.
text = "This is text to test the tokenizer."
@@ -545,7 +545,7 @@ class TokenizerTesterMixin:
def test_rust_tokenizer_signature(self):
if not self.test_rust_tokenizer:
return
self.skipTest(reason="test_rust_tokenizer is set to False")
signature = inspect.signature(self.rust_tokenizer_class.__init__)
@@ -554,7 +554,7 @@ class TokenizerTesterMixin:
def test_tokenizer_slow_store_full_signature(self):
if not self.test_slow_tokenizer:
return
self.skipTest(reason="test_slow_tokenizer is set to False")
signature = inspect.signature(self.tokenizer_class.__init__)
tokenizer = self.get_tokenizer()
@@ -565,7 +565,7 @@ class TokenizerTesterMixin:
def test_tokenizer_fast_store_full_signature(self):
if not self.test_rust_tokenizer:
return
self.skipTest(reason="test_rust_tokenizer is set to False")
signature = inspect.signature(self.rust_tokenizer_class.__init__)
tokenizer = self.get_rust_tokenizer()
@@ -580,11 +580,11 @@ class TokenizerTesterMixin:
def test_rust_and_python_full_tokenizers(self):
if not self.test_rust_tokenizer:
return
self.skipTest(reason="test_rust_tokenizer is set to False")
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
tokenizer = self.get_tokenizer()
rust_tokenizer = self.get_rust_tokenizer()
@@ -1973,7 +1973,7 @@ class TokenizerTesterMixin:
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.pad_token is None:
self.skipTest("No padding token.")
self.skipTest(reason="No padding token.")
else:
empty_tokens = tokenizer("", padding=True, pad_to_multiple_of=8)
normal_tokens = tokenizer("This is a sample input", padding=True, pad_to_multiple_of=8)
@@ -2007,9 +2007,9 @@ class TokenizerTesterMixin:
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.pad_token is None:
self.skipTest("No padding token.")
self.skipTest(reason="No padding token.")
if "attention_mask" not in tokenizer.model_input_names:
self.skipTest("This model does not use attention mask.")
self.skipTest(reason="This model does not use attention mask.")
features = [
{"input_ids": [1, 2, 3, 4, 5, 6], "attention_mask": [1, 1, 1, 1, 1, 0]},
@@ -2126,7 +2126,7 @@ class TokenizerTesterMixin:
def test_padding_warning_message_fast_tokenizer(self):
if not self.test_rust_tokenizer:
return
self.skipTest(reason="test_rust_tokenizer is set to False")
sequence = "This is a text"
@@ -2146,7 +2146,7 @@ class TokenizerTesterMixin:
)
if not self.test_slow_tokenizer:
return
self.skipTest(reason="test_slow_tokenizer is set to False")
tokenizer_slow = self.get_tokenizer()
# check correct behaviour if no pad_token_id exists and add it eventually
@@ -2295,8 +2295,8 @@ class TokenizerTesterMixin:
@require_tokenizers
def test_added_token_are_matched_longest_first(self):
if not self.test_slow_tokenizer:
self.skipTest("This test is only for slow tokenizers")
return
self.skipTest(reason="This test is only for slow tokenizers")
tokenizers = self.get_tokenizers(fast=False)
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
@@ -2305,7 +2305,7 @@ class TokenizerTesterMixin:
tokenizer.add_tokens([AddedToken("extra_id_100")])
except Exception:
# Canine cannot add tokens which are not codepoints
self.skipTest("Cannot add those Added tokens")
self.skipTest(reason="Cannot add those Added tokens")
# XXX: This used to split on `extra_id_1` first, but we match the
# longest added token first now.
@@ -2588,13 +2588,13 @@ class TokenizerTesterMixin:
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
return
self.skipTest(f"{tokenizer.__class__.__name__} is not in the MODEL_TOKENIZER_MAPPING")
config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
config = config_class()
if config.is_encoder_decoder or config.pad_token_id is None:
return
self.skipTest(reason="Model is an encoder-decoder model or has no set pad token id")
model = model_class(config)
@@ -2637,13 +2637,13 @@ class TokenizerTesterMixin:
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
return
self.skipTest(f"{tokenizer.__class__.__name__} is not in the MODEL_TOKENIZER_MAPPING")
config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
config = config_class()
if config.is_encoder_decoder or config.pad_token_id is None:
return
self.skipTest(reason="Model is an encoder-decoder model or has no set pad token id")
model = model_class(config)
@@ -2672,13 +2672,13 @@ class TokenizerTesterMixin:
for tokenizer in tokenizers:
with self.subTest(f"{tokenizer.__class__.__name__}"):
if tokenizer.__class__ not in MODEL_TOKENIZER_MAPPING:
return
self.skipTest(f"{tokenizer.__class__.__name__} is not in the MODEL_TOKENIZER_MAPPING")
config_class, model_class = MODEL_TOKENIZER_MAPPING[tokenizer.__class__]
config = config_class()
if config.is_encoder_decoder or config.pad_token_id is None:
return
self.skipTest(reason="Model is an encoder-decoder model or has no set pad token id")
# Build sequence
first_ten_tokens = list(tokenizer.get_vocab().keys())[:10]
@@ -2712,7 +2712,7 @@ class TokenizerTesterMixin:
@require_torch
def test_prepare_seq2seq_batch(self):
if not self.test_seq2seq:
return
self.skipTest(reason="test_seq2seq is set to False")
tokenizers = self.get_tokenizers()
for tokenizer in tokenizers:
@@ -2740,7 +2740,7 @@ class TokenizerTesterMixin:
src_lang="en_XX", # this should be ignored (for all but mbart) but not cause an error
)
except NotImplementedError:
return
self.skipTest(reason="Encountered NotImplementedError calling prepare_seq2seq_batch")
self.assertEqual(batch.input_ids.shape[1], 3)
self.assertEqual(batch.labels.shape[1], 10)
# max_target_length will default to max_length if not specified
@@ -3008,7 +3008,7 @@ class TokenizerTesterMixin:
def test_tokenization_python_rust_equals(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -3049,7 +3049,7 @@ class TokenizerTesterMixin:
def test_num_special_tokens_to_add_equal(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -3067,7 +3067,7 @@ class TokenizerTesterMixin:
def test_max_length_equal(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -3081,7 +3081,7 @@ class TokenizerTesterMixin:
def test_special_tokens_map_equal(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -3177,10 +3177,10 @@ class TokenizerTesterMixin:
elif is_flax_available():
returned_tensor = "jax"
else:
return
self.skipTest(reason="No expected framework from PT, TF or JAX found")
if not tokenizer.pad_token or tokenizer.pad_token_id < 0:
return
self.skipTest(reason="This tokenizer has no padding token set, or pad_token_id < 0")
tokens = tokenizer.encode_plus(
"HuggingFace is solving NLP one commit at a time",
@@ -3225,7 +3225,7 @@ class TokenizerTesterMixin:
def test_compare_pretokenized_inputs(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -3307,7 +3307,7 @@ class TokenizerTesterMixin:
def test_create_token_type_ids(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -3329,7 +3329,7 @@ class TokenizerTesterMixin:
def test_build_inputs_with_special_tokens(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -3374,7 +3374,7 @@ class TokenizerTesterMixin:
def test_padding(self, max_length=50):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -3600,7 +3600,7 @@ class TokenizerTesterMixin:
def test_padding_different_model_input_name(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -3638,7 +3638,7 @@ class TokenizerTesterMixin:
def test_save_pretrained(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -3713,7 +3713,7 @@ class TokenizerTesterMixin:
def test_embeded_special_tokens(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -3781,7 +3781,7 @@ class TokenizerTesterMixin:
def test_compare_prepare_for_model(self):
if not self.test_slow_tokenizer:
# as we don't have a slow version, we can't compare the outputs between slow and fast versions
return
self.skipTest(reason="test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -3884,7 +3884,7 @@ class TokenizerTesterMixin:
def test_training_new_tokenizer(self):
# This feature only exists for fast tokenizers
if not self.test_rust_tokenizer:
return
self.skipTest(reason="test_rust_tokenizer is set to False")
tokenizer = self.get_rust_tokenizer()
new_tokenizer = tokenizer.train_new_from_iterator(SMALL_TRAINING_CORPUS, 100)
@@ -3919,7 +3919,7 @@ class TokenizerTesterMixin:
def test_training_new_tokenizer_with_special_tokens_change(self):
# This feature only exists for fast tokenizers
if not self.test_rust_tokenizer:
return
self.skipTest(reason="test_rust_tokenizer is set to False")
tokenizer = self.get_rust_tokenizer()
# Test with a special tokens map
@@ -4092,7 +4092,7 @@ class TokenizerTesterMixin:
def test_save_slow_from_fast_and_reload_fast(self):
if not self.test_slow_tokenizer or not self.test_rust_tokenizer:
# we need both slow and fast versions
return
self.skipTest(reason="test_rust_tokenizer or test_slow_tokenizer is set to False")
for tokenizer, pretrained_name, kwargs in self.tokenizers_list:
with self.subTest(f"{tokenizer.__class__.__name__} ({pretrained_name})"):
@@ -4166,7 +4166,7 @@ class TokenizerTesterMixin:
def test_split_special_tokens(self):
if not self.test_slow_tokenizer:
return
self.skipTest(reason="test_slow_tokenizer is set to False")
# Tests the expected appearance (or absence) of special token in encoded output,
# explicit values are not tested because tokenization is model dependent and can change
for tokenizer, pretrained_name, kwargs in self.tokenizers_list: