From 042f420364fe14d0b6731028e6e7884062abeccd Mon Sep 17 00:00:00 2001 From: David Date: Tue, 2 Aug 2022 14:31:01 -0500 Subject: [PATCH] Update pipeline word heuristic to work with whitespace in token offsets (#18402) * Update pipeline word heuristic to work with whitespace in token offsets This change checks for whitespace in the input string at either the character preceding the token or in the first character of the token. This works with tokenizers that return offsets excluding whitespace between words or with offsets including whitespace. fixes #18111 starting * Use smaller model, ensure expected tokenization * Re-run CI (please squash) --- .../pipelines/token_classification.py | 2 +- .../test_pipelines_token_classification.py | 49 +++++++++++++++++++ 2 files changed, 50 insertions(+), 1 deletion(-) diff --git a/src/transformers/pipelines/token_classification.py b/src/transformers/pipelines/token_classification.py index 72f0c5c9c7..04a80b32dd 100644 --- a/src/transformers/pipelines/token_classification.py +++ b/src/transformers/pipelines/token_classification.py @@ -291,7 +291,7 @@ class TokenClassificationPipeline(Pipeline): AggregationStrategy.MAX, }: warnings.warn("Tokenizer does not support real words, using fallback heuristic", UserWarning) - is_subword = sentence[start_ind - 1 : start_ind] != " " if start_ind > 0 else False + is_subword = start_ind > 0 and " " not in sentence[start_ind - 1 : start_ind + 1] if int(input_ids[idx]) == self.tokenizer.unk_token_id: word = word_ref diff --git a/tests/pipelines/test_pipelines_token_classification.py b/tests/pipelines/test_pipelines_token_classification.py index 45916ec31d..1d71529cdf 100644 --- a/tests/pipelines/test_pipelines_token_classification.py +++ b/tests/pipelines/test_pipelines_token_classification.py @@ -535,6 +535,20 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest [{"entity_group": "PER", "score": 0.35, "word": "Ramazotti", "start": 0, "end": 13}], ) + @require_torch + @slow + def test_aggregation_strategy_offsets_with_leading_space(self): + sentence = "We're from New York" + model_name = "brandon25/deberta-base-finetuned-ner" + ner = pipeline("ner", model=model_name, ignore_labels=[], aggregation_strategy="max") + self.assertEqual( + nested_simplify(ner(sentence)), + [ + {"entity_group": "O", "score": 1.0, "word": " We're from", "start": 0, "end": 10}, + {"entity_group": "LOC", "score": 1.0, "word": " New York", "start": 10, "end": 19}, + ], + ) + @require_torch def test_gather_pre_entities(self): model_name = "sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english" @@ -580,6 +594,41 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest ], ) + @require_torch + def test_word_heuristic_leading_space(self): + model_name = "hf-internal-testing/tiny-random-deberta-v2" + tokenizer = AutoTokenizer.from_pretrained(model_name, use_fast=True) + token_classifier = pipeline(task="ner", model=model_name, tokenizer=tokenizer, framework="pt") + + sentence = "I play the theremin" + + tokens = tokenizer( + sentence, + return_attention_mask=False, + return_tensors="pt", + return_special_tokens_mask=True, + return_offsets_mapping=True, + ) + offset_mapping = tokens.pop("offset_mapping").cpu().numpy()[0] + special_tokens_mask = tokens.pop("special_tokens_mask").cpu().numpy()[0] + input_ids = tokens["input_ids"].numpy()[0] + scores = np.array([[1, 0] for _ in input_ids]) # values irrelevant for heuristic + + pre_entities = token_classifier.gather_pre_entities( + sentence, + input_ids, + scores, + offset_mapping, + special_tokens_mask, + aggregation_strategy=AggregationStrategy.FIRST, + ) + + # ensure expected tokenization and correct is_subword values + self.assertEqual( + [(entity["word"], entity["is_subword"]) for entity in pre_entities], + [("▁I", False), ("▁play", False), ("▁the", False), ("▁there", False), ("min", True)], + ) + @require_tf def test_tf_only(self): model_name = "hf-internal-testing/tiny-random-bert-tf-only" # This model only has a TensorFlow version