Adding support for multiple mask tokens. (#14716)

* Adding support for multiple mask tokens.

- Original implem: https://github.com/huggingface/transformers/pull/10222

Co-authored-by: njafer <naveen.jafer@oracle.com>

* In order to accomodate optionally multimodal models like Perceiver

we add information to the tasks to specify tasks where we know for sure
if we need the tokenizer/feature_extractor or not.

* Adding info in the documentation about multi masks.

+ marked as experimental.

* Add a copy() to prevent overriding the same tensor over and over.

* Fixup.

* Adding small test for multi mask with real values..

Co-authored-by: njafer <naveen.jafer@oracle.com>
This commit is contained in:
Nicolas Patry
2021-12-14 16:46:16 +01:00
committed by GitHub
parent 2a606f9974
commit e7ed7ffdcb
3 changed files with 116 additions and 36 deletions

View File

@@ -104,6 +104,32 @@ class FillMaskPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
],
)
outputs = unmasker("My name is <mask> <mask>", top_k=2)
self.assertEqual(
nested_simplify(outputs, decimals=6),
[
[
{
"score": 2.2e-05,
"token": 35676,
"token_str": " Maul",
"sequence": "<s>My name is Maul<mask></s>",
},
{"score": 2.2e-05, "token": 16416, "token_str": "ELS", "sequence": "<s>My name isELS<mask></s>"},
],
[
{
"score": 2.2e-05,
"token": 35676,
"token_str": " Maul",
"sequence": "<s>My name is<mask> Maul</s>",
},
{"score": 2.2e-05, "token": 16416, "token_str": "ELS", "sequence": "<s>My name is<mask>ELS</s>"},
],
],
)
@slow
@require_torch
def test_large_model_pt(self):
@@ -231,9 +257,6 @@ class FillMaskPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
with self.assertRaises(ValueError):
fill_masker([None])
# Multiple masks
with self.assertRaises(PipelineException):
fill_masker(f"This is {tokenizer.mask_token} {tokenizer.mask_token}")
# No mask_token is not supported
with self.assertRaises(PipelineException):
fill_masker("This is")
@@ -242,6 +265,7 @@ class FillMaskPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
self.run_test_targets(model, tokenizer)
self.run_test_top_k_targets(model, tokenizer)
self.fill_mask_with_duplicate_targets_and_top_k(model, tokenizer)
self.fill_mask_with_multiple_masks(model, tokenizer)
def run_test_targets(self, model, tokenizer):
vocab = tokenizer.get_vocab()
@@ -340,3 +364,27 @@ class FillMaskPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
# The target list contains duplicates, so we can't output more
# than them
self.assertEqual(len(outputs), 3)
def fill_mask_with_multiple_masks(self, model, tokenizer):
fill_masker = FillMaskPipeline(model=model, tokenizer=tokenizer)
outputs = fill_masker(
f"This is a {tokenizer.mask_token} {tokenizer.mask_token} {tokenizer.mask_token}", top_k=2
)
self.assertEqual(
outputs,
[
[
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
],
[
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
],
[
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
{"sequence": ANY(str), "score": ANY(float), "token": ANY(int), "token_str": ANY(str)},
],
],
)