From 18d233d52588b4e08dc785fbfecd77529e9effa6 Mon Sep 17 00:00:00 2001 From: Lorenzo Ampil Date: Sun, 17 May 2020 15:25:17 +0800 Subject: [PATCH] Allow the creation of "entity groups" for NerPipeline #3548 (#3957) * Add index to be returned by NerPipeline to allow for the creation of * Add entity groups * Convert entity list to dict * Add entity to entity_group_disagg atfter updating entity gorups * Change 'group' parameter to 'grouped_entities' * Add unit tests for grouped NER pipeline case * Correct variable name typo for NER_FINETUNED_MODELS * Sync grouped tests to recent test updates --- src/transformers/pipelines.py | 75 ++++++++++++++++++++++++++++++----- tests/test_pipelines.py | 16 ++++++++ 2 files changed, 80 insertions(+), 11 deletions(-) diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 36bf137dcf..c193145715 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -868,6 +868,7 @@ class NerPipeline(Pipeline): binary_output: bool = False, ignore_labels=["O"], task: str = "", + grouped_entities: bool = False, ): super().__init__( model=model, @@ -882,6 +883,7 @@ class NerPipeline(Pipeline): self._basic_tokenizer = BasicTokenizer(do_lower_case=False) self.ignore_labels = ignore_labels + self.grouped_entities = grouped_entities def __call__(self, *args, **kwargs): inputs = self._args_parser(*args, **kwargs) @@ -911,23 +913,74 @@ class NerPipeline(Pipeline): score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True) labels_idx = score.argmax(axis=-1) - answer = [] - for idx, label_idx in enumerate(labels_idx): - if self.model.config.id2label[label_idx] not in self.ignore_labels: - answer += [ - { - "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])), - "score": score[idx][label_idx].item(), - "entity": self.model.config.id2label[label_idx], - } - ] + entities = [] + entity_groups = [] + entity_group_disagg = [] + # Filter to labels not in `self.ignore_labels` + filtered_labels_idx = [ + (idx, label_idx) + for idx, label_idx in enumerate(labels_idx) + if self.model.config.id2label[label_idx] not in self.ignore_labels + ] + + for idx, label_idx in filtered_labels_idx: + + entity = { + "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])), + "score": score[idx][label_idx].item(), + "entity": self.model.config.id2label[label_idx], + "index": idx, + } + last_idx, _ = filtered_labels_idx[-1] + if self.grouped_entities: + if not entity_group_disagg: + entity_group_disagg += [entity] + if idx == last_idx: + entity_groups += [self.group_entities(entity_group_disagg)] + continue + + # If the current entity is similar and adjacent to the previous entity, append it to the disaggregated entity group + if ( + entity["entity"] == entity_group_disagg[-1]["entity"] + and entity["index"] == entity_group_disagg[-1]["index"] + 1 + ): + entity_group_disagg += [entity] + # Group the entities at the last entity + if idx == last_idx: + entity_groups += [self.group_entities(entity_group_disagg)] + # If the current entity is different from the previous entity, aggregate the disaggregated entity group + else: + entity_groups += [self.group_entities(entity_group_disagg)] + entity_group_disagg = [entity] + + entities += [entity] # Append - answers += [answer] + if self.grouped_entities: + answers += [entity_groups] + else: + answers += [entities] + if len(answers) == 1: return answers[0] return answers + def group_entities(self, entities): + """ + Returns grouped entities + """ + # Get the last entity in the entity group + entity = entities[-1]["entity"] + scores = np.mean([entity["score"] for entity in entities]) + tokens = [entity["word"] for entity in entities] + + entity_group = { + "entity_group": entity, + "score": np.mean(scores), + "word": self.tokenizer.convert_tokens_to_string(tokens), + } + return entity_group + TokenClassificationPipeline = NerPipeline diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py index 1365365166..66de91e5df 100644 --- a/tests/test_pipelines.py +++ b/tests/test_pipelines.py @@ -160,6 +160,14 @@ class MonoColumnInputTestCase(unittest.TestCase): nlp = pipeline(task="ner", model=model_name, tokenizer=model_name) self._test_mono_column_pipeline(nlp, valid_inputs, mandatory_keys) + @require_torch + def test_ner_grouped(self): + mandatory_keys = {"entity_group", "word", "score"} + valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] + for model_name in NER_FINETUNED_MODELS: + nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, grouped_entities=True) + self._test_mono_column_pipeline(nlp, valid_inputs, mandatory_keys) + @require_tf def test_tf_ner(self): mandatory_keys = {"entity", "word", "score"} @@ -168,6 +176,14 @@ class MonoColumnInputTestCase(unittest.TestCase): nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf") self._test_mono_column_pipeline(nlp, valid_inputs, mandatory_keys) + @require_tf + def test_tf_ner_grouped(self): + mandatory_keys = {"entity_group", "word", "score"} + valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"] + for model_name in NER_FINETUNED_MODELS: + nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf", grouped_entities=True) + self._test_mono_column_pipeline(nlp, valid_inputs, mandatory_keys) + @require_torch def test_torch_sentiment_analysis(self): mandatory_keys = {"label", "score"}