Update pipeline word heuristic to work with whitespace in token offsets (#18402)

* Update pipeline word heuristic to work with whitespace in token offsets

This change checks for whitespace in the input string at either the
character preceding the token or in the first character of the token.
This works with tokenizers that return offsets excluding whitespace
between words or with offsets including whitespace.

fixes #18111

starting

* Use smaller model, ensure expected tokenization

* Re-run CI (please squash)
This commit is contained in:
David
2022-08-02 14:31:01 -05:00
committed by GitHub
parent c382ed8a2f
commit 042f420364
2 changed files with 50 additions and 1 deletions

View File

@@ -291,7 +291,7 @@ class TokenClassificationPipeline(Pipeline):
AggregationStrategy.MAX, AggregationStrategy.MAX,
}: }:
warnings.warn("Tokenizer does not support real words, using fallback heuristic", UserWarning) warnings.warn("Tokenizer does not support real words, using fallback heuristic", UserWarning)
is_subword = sentence[start_ind - 1 : start_ind] != " " if start_ind > 0 else False is_subword = start_ind > 0 and " " not in sentence[start_ind - 1 : start_ind + 1]
if int(input_ids[idx]) == self.tokenizer.unk_token_id: if int(input_ids[idx]) == self.tokenizer.unk_token_id:
word = word_ref word = word_ref

View File

@@ -535,6 +535,20 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
[{"entity_group": "PER", "score": 0.35, "word": "Ramazotti", "start": 0, "end": 13}], [{"entity_group": "PER", "score": 0.35, "word": "Ramazotti", "start": 0, "end": 13}],
) )
@require_torch
@slow
def test_aggregation_strategy_offsets_with_leading_space(self):
sentence = "We're from New York"
model_name = "brandon25/deberta-base-finetuned-ner"
ner = pipeline("ner", model=model_name, ignore_labels=[], aggregation_strategy="max")
self.assertEqual(
nested_simplify(ner(sentence)),
[
{"entity_group": "O", "score": 1.0, "word": " We're from", "start": 0, "end": 10},
{"entity_group": "LOC", "score": 1.0, "word": " New York", "start": 10, "end": 19},
],
)
@require_torch @require_torch
def test_gather_pre_entities(self): def test_gather_pre_entities(self):
model_name = "sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english" model_name = "sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english"
@@ -580,6 +594,41 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
], ],
) )
@require_torch
def test_word_heuristic_leading_space(self):
model_name = "hf-internal-testing/tiny-random-deberta-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True)
token_classifier = pipeline(task="ner", model=model_name, tokenizer=tokenizer, framework="pt")
sentence = "I play the theremin"
tokens = tokenizer(
sentence,
return_attention_mask=False,
return_tensors="pt",
return_special_tokens_mask=True,
return_offsets_mapping=True,
)
offset_mapping = tokens.pop("offset_mapping").cpu().numpy()[0]
special_tokens_mask = tokens.pop("special_tokens_mask").cpu().numpy()[0]
input_ids = tokens["input_ids"].numpy()[0]
scores = np.array([[1, 0] for _ in input_ids]) # values irrelevant for heuristic
pre_entities = token_classifier.gather_pre_entities(
sentence,
input_ids,
scores,
offset_mapping,
special_tokens_mask,
aggregation_strategy=AggregationStrategy.FIRST,
)
# ensure expected tokenization and correct is_subword values
self.assertEqual(
[(entity["word"], entity["is_subword"]) for entity in pre_entities],
[("▁I", False), ("▁play", False), ("▁the", False), ("▁there", False), ("min", True)],
)
@require_tf @require_tf
def test_tf_only(self): def test_tf_only(self):
model_name = "hf-internal-testing/tiny-random-bert-tf-only" # This model only has a TensorFlow version model_name = "hf-internal-testing/tiny-random-bert-tf-only" # This model only has a TensorFlow version