Optimizing away the fill-mask pipeline. (#12113)

* Optimizing away the `fill-mask` pipeline. - Don't send anything to the tokenizer unless needed. Vocab check is much faster - Keep BC by sending data to the tokenizer when needed. Users handling warning messages will see performance benefits again - Make `targets` and `top_k` work together better: `top_k` cannot be higher than `len(targets)` but can still be smaller. - Actually simplify the `target_ids` in case of duplicates (it can happen because we're parsing raw strings) - Removed useless code to fail on empty strings. It works only if the empty string is in first position, moved to ignoring them instead. - Changed the related tests as only the tests would fail correctly (having incorrect value in first position) * Make tests compatible for 2 different vocabs... (at the price of a warning). Co-authored-by: @EtaoinWu * ValueError working globally * Update src/transformers/pipelines/fill_mask.py Co-authored-by: Lysandre Debut <lysandre@huggingface.co> * `tokenizer.vocab` -> `tokenizer.get_vocab()` for more compatibility + fallback. Co-authored-by: Lysandre Debut <lysandre@huggingface.co>
2021-06-23 10:38:04 +02:00
parent 037e466b10
commit d4be498441
2 changed files with 81 additions and 30 deletions
--- a/tests/test_pipelines_fill_mask.py
+++ b/tests/test_pipelines_fill_mask.py
@@ -78,7 +78,8 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
    @require_torch
    def test_torch_fill_mask_with_targets(self):
        valid_inputs = ["My name is <mask>"]
-        valid_targets = [[" Teven", " Patrick", " Clara"], [" Sam"]]
+        # ' Sam' will yield a warning but work
+        valid_targets = [[" Teven", "ĠPatrick", "ĠClara"], ["ĠSam"], [" Sam"]]
        invalid_targets = [[], [""], ""]
        for model_name in self.small_models:
            unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt")
@@ -89,10 +90,34 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
            for targets in invalid_targets:
                self.assertRaises(ValueError, unmasker, valid_inputs, targets=targets)

+    @require_torch
+    def test_torch_fill_mask_with_targets_and_topk(self):
+        model_name = self.small_models[0]
+        unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt")
+        targets = [" Teven", "ĠPatrick", "ĠClara"]
+        top_k = 2
+        outputs = unmasker("My name is <mask>", targets=targets, top_k=top_k)
+
+        self.assertEqual(len(outputs), 2)
+
+    @require_torch
+    def test_torch_fill_mask_with_duplicate_targets_and_topk(self):
+        model_name = self.small_models[0]
+        unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="pt")
+        # String duplicates + id duplicates
+        targets = [" Teven", "ĠPatrick", "ĠClara", "ĠClara", " Clara"]
+        top_k = 10
+        outputs = unmasker("My name is <mask>", targets=targets, top_k=top_k)
+
+        # The target list contains duplicates, so we can't output more
+        # than them
+        self.assertEqual(len(outputs), 3)
+
    @require_tf
    def test_tf_fill_mask_with_targets(self):
        valid_inputs = ["My name is <mask>"]
-        valid_targets = [[" Teven", " Patrick", " Clara"], [" Sam"]]
+        # ' Sam' will yield a warning but work
+        valid_targets = [[" Teven", "ĠPatrick", "ĠClara"], ["ĠSam"], [" Sam"]]
        invalid_targets = [[], [""], ""]
        for model_name in self.small_models:
            unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="tf")
@@ -111,7 +136,7 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
            "My name is <mask>",
            "The largest city in France is <mask>",
        ]
-        valid_targets = [" Patrick", " Clara"]
+        valid_targets = ["ĠPatrick", "ĠClara"]
        for model_name in self.large_models:
            unmasker = pipeline(
                task="fill-mask",
@@ -184,7 +209,7 @@ class FillMaskPipelineTests(MonoInputPipelineCommonMixin, unittest.TestCase):
            "My name is <mask>",
            "The largest city in France is <mask>",
        ]
-        valid_targets = [" Patrick", " Clara"]
+        valid_targets = ["ĠPatrick", "ĠClara"]
        for model_name in self.large_models:
            unmasker = pipeline(task="fill-mask", model=model_name, tokenizer=model_name, framework="tf", top_k=2)