Update pipeline word heuristic to work with whitespace in token offsets (#18402)

* Update pipeline word heuristic to work with whitespace in token offsets This change checks for whitespace in the input string at either the character preceding the token or in the first character of the token. This works with tokenizers that return offsets excluding whitespace between words or with offsets including whitespace. fixes #18111 starting * Use smaller model, ensure expected tokenization * Re-run CI (please squash)
2022-08-02 14:31:01 -05:00
parent c382ed8a2f
commit 042f420364
2 changed files with 50 additions and 1 deletions
--- a/tests/pipelines/test_pipelines_token_classification.py
+++ b/tests/pipelines/test_pipelines_token_classification.py
@@ -535,6 +535,20 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
            [{"entity_group": "PER", "score": 0.35, "word": "Ramazotti", "start": 0, "end": 13}],
        )

+    @require_torch
+    @slow
+    def test_aggregation_strategy_offsets_with_leading_space(self):
+        sentence = "We're from New York"
+        model_name = "brandon25/deberta-base-finetuned-ner"
+        ner = pipeline("ner", model=model_name, ignore_labels=[], aggregation_strategy="max")
+        self.assertEqual(
+            nested_simplify(ner(sentence)),
+            [
+                {"entity_group": "O", "score": 1.0, "word": " We're from", "start": 0, "end": 10},
+                {"entity_group": "LOC", "score": 1.0, "word": " New York", "start": 10, "end": 19},
+            ],
+        )
+
    @require_torch
    def test_gather_pre_entities(self):
        model_name = "sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english"
@@ -580,6 +594,41 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
            ],
        )

+    @require_torch
+    def test_word_heuristic_leading_space(self):
+        model_name = "hf-internal-testing/tiny-random-deberta-v2"
+        tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
+        token_classifier = pipeline(task="ner", model=model_name, tokenizer=tokenizer, framework="pt")
+
+        sentence = "I play the theremin"
+
+        tokens = tokenizer(
+            sentence,
+            return_attention_mask=False,
+            return_tensors="pt",
+            return_special_tokens_mask=True,
+            return_offsets_mapping=True,
+        )
+        offset_mapping = tokens.pop("offset_mapping").cpu().numpy()[0]
+        special_tokens_mask = tokens.pop("special_tokens_mask").cpu().numpy()[0]
+        input_ids = tokens["input_ids"].numpy()[0]
+        scores = np.array([[1, 0] for _ in input_ids])  # values irrelevant for heuristic
+
+        pre_entities = token_classifier.gather_pre_entities(
+            sentence,
+            input_ids,
+            scores,
+            offset_mapping,
+            special_tokens_mask,
+            aggregation_strategy=AggregationStrategy.FIRST,
+        )
+
+        # ensure expected tokenization and correct is_subword values
+        self.assertEqual(
+            [(entity["word"], entity["is_subword"]) for entity in pre_entities],
+            [("▁I", False), ("▁play", False), ("▁the", False), ("▁there", False), ("min", True)],
+        )
+
    @require_tf
    def test_tf_only(self):
        model_name = "hf-internal-testing/tiny-random-bert-tf-only"  # This model only has a TensorFlow version