From 18d233d52588b4e08dc785fbfecd77529e9effa6 Mon Sep 17 00:00:00 2001
From: Lorenzo Ampil <lorenzojulioampil@gmail.com>
Date: Sun, 17 May 2020 15:25:17 +0800
Subject: [PATCH] Allow the creation of "entity groups" for NerPipeline #3548
 (#3957)

* Add index to be returned by NerPipeline to allow for the creation of
  entity groups

* Add entity groups

* Convert entity list to dict

* Add entity to entity_group_disagg after updating entity groups

* Change 'group' parameter to 'grouped_entities'

* Add unit tests for grouped NER pipeline case

* Correct variable name typo for NER_FINETUNED_MODELS

* Sync grouped tests to recent test updates
---
 src/transformers/pipelines.py | 75 ++++++++++++++++++++++++++++++-----
 tests/test_pipelines.py       | 16 ++++++++
 2 files changed, 80 insertions(+), 11 deletions(-)

diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py
index 36bf137dcf..c193145715 100755
--- a/src/transformers/pipelines.py
+++ b/src/transformers/pipelines.py
@@ -868,6 +868,7 @@ class NerPipeline(Pipeline):
         binary_output: bool = False,
         ignore_labels=["O"],
         task: str = "",
+        grouped_entities: bool = False,
     ):
         super().__init__(
             model=model,
@@ -882,6 +883,7 @@ class NerPipeline(Pipeline):
 
         self._basic_tokenizer = BasicTokenizer(do_lower_case=False)
         self.ignore_labels = ignore_labels
+        self.grouped_entities = grouped_entities
 
     def __call__(self, *args, **kwargs):
         inputs = self._args_parser(*args, **kwargs)
@@ -911,23 +913,74 @@ class NerPipeline(Pipeline):
             score = np.exp(entities) / np.exp(entities).sum(-1, keepdims=True)
             labels_idx = score.argmax(axis=-1)
 
-            answer = []
-            for idx, label_idx in enumerate(labels_idx):
-                if self.model.config.id2label[label_idx] not in self.ignore_labels:
-                    answer += [
-                        {
-                            "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
-                            "score": score[idx][label_idx].item(),
-                            "entity": self.model.config.id2label[label_idx],
-                        }
-                    ]
+            entities = []
+            entity_groups = []
+            entity_group_disagg = []
+            # Filter to labels not in `self.ignore_labels`
+            filtered_labels_idx = [
+                (idx, label_idx)
+                for idx, label_idx in enumerate(labels_idx)
+                if self.model.config.id2label[label_idx] not in self.ignore_labels
+            ]
+
+            for idx, label_idx in filtered_labels_idx:
+
+                entity = {
+                    "word": self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])),
+                    "score": score[idx][label_idx].item(),
+                    "entity": self.model.config.id2label[label_idx],
+                    "index": idx,
+                }
+                last_idx, _ = filtered_labels_idx[-1]
+                if self.grouped_entities:
+                    if not entity_group_disagg:
+                        entity_group_disagg += [entity]
+                        if idx == last_idx:
+                            entity_groups += [self.group_entities(entity_group_disagg)]
+                        continue
+
+                    # If the current entity has the same label as the previous entity and directly follows it, extend the open group
+                    if (
+                        entity["entity"] == entity_group_disagg[-1]["entity"]
+                        and entity["index"] == entity_group_disagg[-1]["index"] + 1
+                    ):
+                        entity_group_disagg += [entity]
+                    # Otherwise aggregate the open group and start a new one with the current entity
+                    else:
+                        entity_groups += [self.group_entities(entity_group_disagg)]
+                        entity_group_disagg = [entity]
+                    # At the last entity, flush the open group too; otherwise a group it has just started would be dropped
+                    if idx == last_idx:
+                        entity_groups += [self.group_entities(entity_group_disagg)]
+
+                entities += [entity]
 
             # Append
-            answers += [answer]
+            if self.grouped_entities:
+                answers += [entity_groups]
+            else:
+                answers += [entities]
+
         if len(answers) == 1:
             return answers[0]
         return answers
 
+    def group_entities(self, entities):
+        """
+        Merge a run of token-level entity dicts (each with "entity", "score" and "word" keys) into one group.
+        The group takes the label of the last entity in the run and the mean of the token scores.
+        """
+        label = entities[-1]["entity"]
+        mean_score = np.mean([token["score"] for token in entities])
+        tokens = [token["word"] for token in entities]
+
+        # The tokenizer joins the individual tokens back into a single string
+        return {
+            "entity_group": label,
+            "score": mean_score,
+            "word": self.tokenizer.convert_tokens_to_string(tokens),
+        }
+
 
 TokenClassificationPipeline = NerPipeline
 
diff --git a/tests/test_pipelines.py b/tests/test_pipelines.py
index 1365365166..66de91e5df 100644
--- a/tests/test_pipelines.py
+++ b/tests/test_pipelines.py
@@ -160,6 +160,14 @@ class MonoColumnInputTestCase(unittest.TestCase):
             nlp = pipeline(task="ner", model=model_name, tokenizer=model_name)
             self._test_mono_column_pipeline(nlp, valid_inputs, mandatory_keys)
 
+    @require_torch
+    def test_ner_grouped(self):
+        # Grouped output is checked for "entity_group" rather than the per-token "entity" key
+        valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
+        for model_name in NER_FINETUNED_MODELS:
+            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, grouped_entities=True)
+            self._test_mono_column_pipeline(nlp, valid_inputs, {"entity_group", "word", "score"})
+
     @require_tf
     def test_tf_ner(self):
         mandatory_keys = {"entity", "word", "score"}
@@ -168,6 +176,14 @@ class MonoColumnInputTestCase(unittest.TestCase):
             nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf")
             self._test_mono_column_pipeline(nlp, valid_inputs, mandatory_keys)
 
+    @require_tf
+    def test_tf_ner_grouped(self):
+        # Grouped output is checked for "entity_group" rather than the per-token "entity" key
+        valid_inputs = ["HuggingFace is solving NLP one commit at a time.", "HuggingFace is based in New-York & Paris"]
+        for model_name in NER_FINETUNED_MODELS:
+            nlp = pipeline(task="ner", model=model_name, tokenizer=model_name, framework="tf", grouped_entities=True)
+            self._test_mono_column_pipeline(nlp, valid_inputs, {"entity_group", "word", "score"})
+
     @require_torch
     def test_torch_sentiment_analysis(self):
         mandatory_keys = {"label", "score"}