Fix FillMaskPipelineTests (#22894)
* fix --------- Co-authored-by: ydshieh <ydshieh@users.noreply.github.com>
This commit is contained in:
@@ -394,11 +394,7 @@ class FlaubertModelTest(ModelTesterMixin, PipelineTesterMixin, unittest.TestCase
|
|||||||
def is_pipeline_test_to_skip(
|
def is_pipeline_test_to_skip(
|
||||||
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
|
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
|
||||||
):
|
):
|
||||||
if pipeline_test_casse_name == "FillMaskPipelineTests":
|
if (
|
||||||
# Get `ValueError: AttributeError: 'NoneType' object has no attribute 'new_ones'` or `AssertionError`.
|
|
||||||
# `FlaubertConfig` was never used in pipeline tests: cannot create a simple tokenizer
|
|
||||||
return True
|
|
||||||
elif (
|
|
||||||
pipeline_test_casse_name == "QAPipelineTests"
|
pipeline_test_casse_name == "QAPipelineTests"
|
||||||
and tokenizer_name is not None
|
and tokenizer_name is not None
|
||||||
and not tokenizer_name.endswith("Fast")
|
and not tokenizer_name.endswith("Fast")
|
||||||
|
|||||||
@@ -310,11 +310,7 @@ class TFFlaubertModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.Test
|
|||||||
def is_pipeline_test_to_skip(
|
def is_pipeline_test_to_skip(
|
||||||
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
|
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
|
||||||
):
|
):
|
||||||
if pipeline_test_casse_name == "FillMaskPipelineTests":
|
if (
|
||||||
# Get `ValueError: AttributeError: 'NoneType' object has no attribute 'new_ones'` or `AssertionError`.
|
|
||||||
# `FlaubertConfig` was never used in pipeline tests: cannot create a simple tokenizer
|
|
||||||
return True
|
|
||||||
elif (
|
|
||||||
pipeline_test_casse_name == "QAPipelineTests"
|
pipeline_test_casse_name == "QAPipelineTests"
|
||||||
and tokenizer_name is not None
|
and tokenizer_name is not None
|
||||||
and not tokenizer_name.endswith("Fast")
|
and not tokenizer_name.endswith("Fast")
|
||||||
|
|||||||
@@ -313,11 +313,7 @@ class TFXLMModelTest(TFModelTesterMixin, PipelineTesterMixin, unittest.TestCase)
|
|||||||
def is_pipeline_test_to_skip(
|
def is_pipeline_test_to_skip(
|
||||||
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
|
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
|
||||||
):
|
):
|
||||||
if pipeline_test_casse_name == "FillMaskPipelineTests":
|
if (
|
||||||
# Get `ValueError: AttributeError: 'NoneType' object has no attribute 'new_ones'` or `AssertionError`.
|
|
||||||
# `XLMConfig` was never used in pipeline tests: cannot create a simple tokenizer
|
|
||||||
return True
|
|
||||||
elif (
|
|
||||||
pipeline_test_casse_name == "QAPipelineTests"
|
pipeline_test_casse_name == "QAPipelineTests"
|
||||||
and tokenizer_name is not None
|
and tokenizer_name is not None
|
||||||
and not tokenizer_name.endswith("Fast")
|
and not tokenizer_name.endswith("Fast")
|
||||||
|
|||||||
@@ -395,11 +395,7 @@ class XLMModelTest(ModelTesterMixin, GenerationTesterMixin, PipelineTesterMixin,
|
|||||||
def is_pipeline_test_to_skip(
|
def is_pipeline_test_to_skip(
|
||||||
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
|
self, pipeline_test_casse_name, config_class, model_architecture, tokenizer_name, processor_name
|
||||||
):
|
):
|
||||||
if pipeline_test_casse_name == "FillMaskPipelineTests":
|
if (
|
||||||
# Get `ValueError: AttributeError: 'NoneType' object has no attribute 'new_ones'` or `AssertionError`.
|
|
||||||
# `XLMConfig` was never used in pipeline tests: cannot create a simple tokenizer
|
|
||||||
return True
|
|
||||||
elif (
|
|
||||||
pipeline_test_casse_name == "QAPipelineTests"
|
pipeline_test_casse_name == "QAPipelineTests"
|
||||||
and tokenizer_name is not None
|
and tokenizer_name is not None
|
||||||
and not tokenizer_name.endswith("Fast")
|
and not tokenizer_name.endswith("Fast")
|
||||||
|
|||||||
@@ -302,7 +302,8 @@ class FillMaskPipelineTests(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
target_ids = {vocab[el] for el in targets}
|
target_ids = {vocab[el] for el in targets}
|
||||||
self.assertEqual({el["token"] for el in outputs}, target_ids)
|
self.assertEqual({el["token"] for el in outputs}, target_ids)
|
||||||
self.assertEqual({el["token_str"] for el in outputs}, set(targets))
|
processed_targets = [tokenizer.decode([x]) for x in target_ids]
|
||||||
|
self.assertEqual({el["token_str"] for el in outputs}, set(processed_targets))
|
||||||
|
|
||||||
# Call argument
|
# Call argument
|
||||||
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
|
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
|
||||||
@@ -316,24 +317,29 @@ class FillMaskPipelineTests(unittest.TestCase):
|
|||||||
)
|
)
|
||||||
target_ids = {vocab[el] for el in targets}
|
target_ids = {vocab[el] for el in targets}
|
||||||
self.assertEqual({el["token"] for el in outputs}, target_ids)
|
self.assertEqual({el["token"] for el in outputs}, target_ids)
|
||||||
self.assertEqual({el["token_str"] for el in outputs}, set(targets))
|
processed_targets = [tokenizer.decode([x]) for x in target_ids]
|
||||||
|
self.assertEqual({el["token_str"] for el in outputs}, set(processed_targets))
|
||||||
|
|
||||||
# Score equivalence
|
# Score equivalence
|
||||||
outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=targets)
|
outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=targets)
|
||||||
tokens = [top_mask["token_str"] for top_mask in outputs]
|
tokens = [top_mask["token_str"] for top_mask in outputs]
|
||||||
scores = [top_mask["score"] for top_mask in outputs]
|
scores = [top_mask["score"] for top_mask in outputs]
|
||||||
|
|
||||||
unmasked_targets = fill_masker(f"This is a {tokenizer.mask_token}", targets=tokens)
|
# For some BPE tokenizers, `</w>` is removed during decoding, so `token_str` won't be the same as in `targets`.
|
||||||
target_scores = [top_mask["score"] for top_mask in unmasked_targets]
|
if set(tokens) == set(targets):
|
||||||
self.assertEqual(nested_simplify(scores), nested_simplify(target_scores))
|
unmasked_targets = fill_masker(f"This is a {tokenizer.mask_token}", targets=tokens)
|
||||||
|
target_scores = [top_mask["score"] for top_mask in unmasked_targets]
|
||||||
|
self.assertEqual(nested_simplify(scores), nested_simplify(target_scores))
|
||||||
|
|
||||||
# Raises with invalid
|
# Raises with invalid
|
||||||
with self.assertRaises(ValueError):
|
|
||||||
outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=[""])
|
|
||||||
with self.assertRaises(ValueError):
|
with self.assertRaises(ValueError):
|
||||||
outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=[])
|
outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=[])
|
||||||
with self.assertRaises(ValueError):
|
# For some tokenizers, `""` is actually in the vocabulary and the expected error won't be raised
|
||||||
outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets="")
|
if "" not in tokenizer.get_vocab():
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets=[""])
|
||||||
|
with self.assertRaises(ValueError):
|
||||||
|
outputs = fill_masker(f"This is a {tokenizer.mask_token}", targets="")
|
||||||
|
|
||||||
def run_test_top_k(self, model, tokenizer):
|
def run_test_top_k(self, model, tokenizer):
|
||||||
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer, top_k=2)
|
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer, top_k=2)
|
||||||
@@ -368,10 +374,11 @@ class FillMaskPipelineTests(unittest.TestCase):
|
|||||||
# If we use the most probable targets, and filter differently, we should still
|
# If we use the most probable targets, and filter differently, we should still
|
||||||
# have the same results
|
# have the same results
|
||||||
targets2 = [el["token_str"] for el in sorted(outputs, key=lambda x: x["score"], reverse=True)]
|
targets2 = [el["token_str"] for el in sorted(outputs, key=lambda x: x["score"], reverse=True)]
|
||||||
outputs2 = fill_masker(f"This is a {tokenizer.mask_token}", top_k=3, targets=targets2)
|
# For some BPE tokenizers, `</w>` is removed during decoding, so `token_str` won't be the same as in `targets`.
|
||||||
|
if set(targets2).issubset(targets):
|
||||||
# They should yield exactly the same result
|
outputs2 = fill_masker(f"This is a {tokenizer.mask_token}", top_k=3, targets=targets2)
|
||||||
self.assertEqual(nested_simplify(outputs), nested_simplify(outputs2))
|
# They should yield exactly the same result
|
||||||
|
self.assertEqual(nested_simplify(outputs), nested_simplify(outputs2))
|
||||||
|
|
||||||
def fill_mask_with_duplicate_targets_and_top_k(self, model, tokenizer):
|
def fill_mask_with_duplicate_targets_and_top_k(self, model, tokenizer):
|
||||||
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
|
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
|
||||||
|
|||||||
Reference in New Issue
Block a user