Attention mask is important in the case of batching... (#16222)

* Attention mask is important in the case of batching... * Improve the fix. * Making the sentences different enough that they exhibit different predictions.
2022-03-18 10:02:12 +01:00
parent ec4e421b7d
commit ecb4662d17
3 changed files with 18 additions and 2 deletions
--- a/tests/pipelines/test_pipelines_token_classification.py
+++ b/tests/pipelines/test_pipelines_token_classification.py
@@ -649,6 +649,23 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
            ],
        )

+        # Batch size does not affect outputs (attention_mask is required)
+        sentences = ["This is a test !", "Another test this is with longer sentence"]
+        outputs = token_classifier(sentences)
+        outputs_batched = token_classifier(sentences, batch_size=2)
+        # Batching does not make a difference in predictions
+        self.assertEqual(nested_simplify(outputs_batched), nested_simplify(outputs))
+        self.assertEqual(
+            nested_simplify(outputs_batched),
+            [
+                [
+                    {"entity": "I-MISC", "score": 0.115, "index": 1, "word": "this", "start": 0, "end": 4},
+                    {"entity": "I-MISC", "score": 0.115, "index": 2, "word": "is", "start": 5, "end": 7},
+                ],
+                [],
+            ],
+        )
+
    @require_torch
    def test_pt_ignore_subwords_slow_tokenizer_raises(self):
        model_name = "sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english"