Adding a new align_to_words param to qa pipeline. (#18010)

* Adding a new `align_to_words` param to qa pipeline. * Update src/transformers/pipelines/question_answering.py Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com> * Import protection. Co-authored-by: Sylvain Gugger <35901082+sgugger@users.noreply.github.com>
2022-08-09 18:50:02 +02:00
parent ab2006e3d6
commit 9f5fe63548
2 changed files with 61 additions and 10 deletions
--- a/tests/pipelines/test_pipelines_question_answering.py
+++ b/tests/pipelines/test_pipelines_question_answering.py
@@ -171,6 +171,29 @@ class QAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):

        self.assertEqual(nested_simplify(outputs), {"score": 0.028, "start": 0, "end": 11, "answer": "HuggingFace"})

+    @slow
+    @require_torch
+    def test_small_model_japanese(self):
+        question_answerer = pipeline(
+            "question-answering",
+            model="KoichiYasuoka/deberta-base-japanese-aozora-ud-head",
+        )
+        output = question_answerer(question="国語", context="全学年にわたって小学校の国語の教科書に挿し絵が用いられている")
+
+        # Wrong answer, the whole text is identified as one "word" since the tokenizer does not include
+        # a pretokenizer
+        self.assertEqual(
+            nested_simplify(output),
+            {"score": 1.0, "start": 0, "end": 30, "answer": "全学年にわたって小学校の国語の教科書に挿し絵が用いられている"},
+        )
+
+        # Disable word alignment
+        output = question_answerer(question="国語", context="全学年にわたって小学校の国語の教科書に挿し絵が用いられている", align_to_words=False)
+        self.assertEqual(
+            nested_simplify(output),
+            {"score": 1.0, "start": 15, "end": 18, "answer": "教科書"},
+        )
+
    @slow
    @require_torch
    def test_small_model_long_context_cls_slow(self):