From 2e12d90b9ed3005a3fa412566b899de2a229e033 Mon Sep 17 00:00:00 2001
From: Nicolas Patry <patry.nicolas@protonmail.com>
Date: Wed, 8 Dec 2021 09:54:24 +0100
Subject: [PATCH] Fixing Dataset for TQA + token-classification. (#14658)

* Fixing Dataset for TQA + token-classification.

* Fixing the tests.

* Making sure `offset_mapping` is a valid argument.
---
 .../pipelines/table_question_answering.py     |  5 ++-
 .../pipelines/token_classification.py         | 35 ++++++++++++-------
 tests/test_pipelines_common.py                |  7 ++--
 ...test_pipelines_table_question_answering.py | 18 +++++++---
 tests/test_pipelines_token_classification.py  | 13 +++++++
 5 files changed, 58 insertions(+), 20 deletions(-)

diff --git a/src/transformers/pipelines/table_question_answering.py b/src/transformers/pipelines/table_question_answering.py
index 1ec93d1160..3634fa8c69 100644
--- a/src/transformers/pipelines/table_question_answering.py
+++ b/src/transformers/pipelines/table_question_answering.py
@@ -1,4 +1,5 @@
 import collections
+import types
 
 import numpy as np
 
@@ -9,7 +10,7 @@ from ..file_utils import (
     is_torch_available,
     requires_backends,
 )
-from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline, PipelineException
+from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Dataset, Pipeline, PipelineException
 
 
 if is_torch_available():
@@ -58,6 +59,8 @@ class TableQuestionAnsweringArgumentHandler(ArgumentHandler):
                         f"If keyword argument `table` is a list of dictionaries, each dictionary should have a `table` "
                         f"and `query` key, but only dictionary has keys {table[0].keys()} `table` and `query` keys."
                     )
+            elif Dataset is not None and isinstance(table, Dataset) or isinstance(table, types.GeneratorType):
+                return table
             else:
                 raise ValueError(
                     f"Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but "
diff --git a/src/transformers/pipelines/token_classification.py b/src/transformers/pipelines/token_classification.py
index 6324327dcd..9bfea4c378 100644
--- a/src/transformers/pipelines/token_classification.py
+++ b/src/transformers/pipelines/token_classification.py
@@ -1,3 +1,4 @@
+import types
 import warnings
 from typing import List, Optional, Tuple, Union
 
@@ -5,7 +6,7 @@ import numpy as np
 
 from ..file_utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
 from ..models.bert.tokenization_bert import BasicTokenizer
-from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline
+from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Dataset, Pipeline
 
 
 if is_tf_available():
@@ -28,6 +29,8 @@ class TokenClassificationArgumentHandler(ArgumentHandler):
         elif isinstance(inputs, str):
             inputs = [inputs]
             batch_size = 1
+        elif Dataset is not None and isinstance(inputs, Dataset) or isinstance(inputs, types.GeneratorType):
+            return inputs, None
         else:
             raise ValueError("At least one input is required.")
 
@@ -112,8 +115,13 @@ class TokenClassificationPipeline(Pipeline):
         grouped_entities: Optional[bool] = None,
         ignore_subwords: Optional[bool] = None,
         aggregation_strategy: Optional[AggregationStrategy] = None,
+        offset_mapping: Optional[List[Tuple[int, int]]] = None,
     ):
 
+        preprocess_params = {}
+        if offset_mapping is not None:
+            preprocess_params["offset_mapping"] = offset_mapping
+
         postprocess_params = {}
         if grouped_entities is not None or ignore_subwords is not None:
             if grouped_entities and ignore_subwords:
@@ -147,7 +155,7 @@ class TokenClassificationPipeline(Pipeline):
             postprocess_params["aggregation_strategy"] = aggregation_strategy
         if ignore_labels is not None:
             postprocess_params["ignore_labels"] = ignore_labels
-        return {}, {}, postprocess_params
+        return preprocess_params, {}, postprocess_params
 
     def __call__(self, inputs: Union[str, List[str]], **kwargs):
         """
@@ -174,12 +182,13 @@ class TokenClassificationPipeline(Pipeline):
               Only exists if the offsets are available within the tokenizer
         """
 
-        _inputs, offset_mappings = self._args_parser(inputs, **kwargs)
-        self.offset_mappings = offset_mappings
+        _inputs, offset_mapping = self._args_parser(inputs, **kwargs)
+        if offset_mapping:
+            kwargs["offset_mapping"] = offset_mapping
 
         return super().__call__(inputs, **kwargs)
 
-    def preprocess(self, sentence):
+    def preprocess(self, sentence, offset_mapping=None):
         truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
         model_inputs = self.tokenizer(
             sentence,
@@ -189,8 +198,7 @@ class TokenClassificationPipeline(Pipeline):
             return_special_tokens_mask=True,
             return_offsets_mapping=self.tokenizer.is_fast,
         )
-        if self.offset_mappings:
-            offset_mapping = self.offset_mappings[0]
+        if offset_mapping:
             model_inputs["offset_mapping"] = offset_mapping
 
         model_inputs["sentence"] = sentence
@@ -262,12 +270,13 @@ class TokenClassificationPipeline(Pipeline):
             word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))
             if offset_mapping is not None:
                 start_ind, end_ind = offset_mapping[idx]
-                if self.framework == "pt":
-                    start_ind = start_ind.item()
-                    end_ind = end_ind.item()
-                else:
-                    start_ind = int(start_ind.numpy())
-                    end_ind = int(end_ind.numpy())
+                if not isinstance(start_ind, int):
+                    if self.framework == "pt":
+                        start_ind = start_ind.item()
+                        end_ind = end_ind.item()
+                    else:
+                        start_ind = int(start_ind.numpy())
+                        end_ind = int(end_ind.numpy())
                 word_ref = sentence[start_ind:end_ind]
                 if getattr(self.tokenizer._tokenizer.model, "continuing_subword_prefix", None):
                     # This is a BPE, word aware tokenizer, there is a correct way
diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py
index 12cee76bf8..62a1831dc4 100644
--- a/tests/test_pipelines_common.py
+++ b/tests/test_pipelines_common.py
@@ -183,9 +183,12 @@ class PipelineTestCaseMeta(type):
 
                     # 10 examples with batch size 4 means there needs to be a unfinished batch
                     # which is important for the unbatcher
-                    dataset = [copy.deepcopy(random.choice(examples)) for i in range(10)]
+                    def data(n):
+                        for _ in range(n):
+                            # Need a deep copy because the Conversation object is mutated
+                            yield copy.deepcopy(random.choice(examples))
 
-                    for item in pipeline(dataset, batch_size=4):
+                    for item in pipeline(data(10), batch_size=4):
                         pass
 
                 run_batch_test(pipeline, examples)
diff --git a/tests/test_pipelines_table_question_answering.py b/tests/test_pipelines_table_question_answering.py
index 789e92c3d7..0793d6586c 100644
--- a/tests/test_pipelines_table_question_answering.py
+++ b/tests/test_pipelines_table_question_answering.py
@@ -35,17 +35,16 @@ from transformers.testing_utils import (
 from .test_pipelines_common import PipelineTestCaseMeta
 
 
-@require_tensorflow_probability
-@require_torch_scatter
-@require_torch
-@require_pandas
 @is_pipeline_test
 class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
     # Putting it there for consistency, but TQA do not have fast tokenizer
     # which are needed to generate automatic tests
     model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
 
+    @require_tensorflow_probability
+    @require_pandas
     @require_tf
+    @require_torch
     def test_small_model_tf(self):
         model_id = "lysandre/tiny-tapas-random-wtq"
         model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
@@ -147,6 +146,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
             )
 
     @require_torch
+    @require_torch_scatter
     def test_small_model_pt(self):
         model_id = "lysandre/tiny-tapas-random-wtq"
         model = AutoModelForTableQuestionAnswering.from_pretrained(model_id)
@@ -248,6 +248,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
             )
 
     @require_torch
+    @require_torch_scatter
     def test_slow_tokenizer_sqa_pt(self):
         model_id = "lysandre/tiny-tapas-random-sqa"
         model = AutoModelForTableQuestionAnswering.from_pretrained(model_id)
@@ -366,6 +367,9 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
             )
 
     @require_tf
+    @require_tensorflow_probability
+    @require_pandas
+    @require_torch
     def test_slow_tokenizer_sqa_tf(self):
         model_id = "lysandre/tiny-tapas-random-sqa"
         model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
@@ -484,6 +488,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
             )
 
     @slow
+    @require_torch_scatter
     def test_integration_wtq_pt(self):
         table_querier = pipeline("table-question-answering")
 
@@ -528,6 +533,8 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
         self.assertListEqual(results, expected_results)
 
     @slow
+    @require_tensorflow_probability
+    @require_pandas
     def test_integration_wtq_tf(self):
         model_id = "google/tapas-base-finetuned-wtq"
         model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id)
@@ -575,6 +582,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
         self.assertListEqual(results, expected_results)
 
     @slow
+    @require_torch_scatter
     def test_integration_sqa_pt(self):
         table_querier = pipeline(
             "table-question-answering",
@@ -598,6 +606,8 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
         self.assertListEqual(results, expected_results)
 
     @slow
+    @require_tensorflow_probability
+    @require_pandas
     def test_integration_sqa_tf(self):
         model_id = "google/tapas-base-finetuned-sqa"
         model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id)
diff --git a/tests/test_pipelines_token_classification.py b/tests/test_pipelines_token_classification.py
index 8c469e14ab..94ac7a19ce 100644
--- a/tests/test_pipelines_token_classification.py
+++ b/tests/test_pipelines_token_classification.py
@@ -636,6 +636,19 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
             [],
         )
 
+        token_classifier = pipeline(task="token-classification", model=model_name, framework="pt")
+        # Override the tokenizer-computed offset_mapping with a caller-supplied one
+        outputs = token_classifier(
+            "This is a test !", offset_mapping=[(0, 0), (0, 1), (0, 2), (0, 0), (0, 0), (0, 0), (0, 0)]
+        )
+        self.assertEqual(
+            nested_simplify(outputs),
+            [
+                {"entity": "I-MISC", "score": 0.115, "index": 1, "word": "this", "start": 0, "end": 1},
+                {"entity": "I-MISC", "score": 0.115, "index": 2, "word": "is", "start": 0, "end": 2},
+            ],
+        )
+
     @require_torch
     def test_pt_ignore_subwords_slow_tokenizer_raises(self):
         model_name = "sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english"