From 2e12d90b9ed3005a3fa412566b899de2a229e033 Mon Sep 17 00:00:00 2001 From: Nicolas Patry Date: Wed, 8 Dec 2021 09:54:24 +0100 Subject: [PATCH] Fixing Dataset for TQA + token-classification. (#14658) * Fixing Dataset for TQA + token-classification. * Fixing the tests. * Making sure `offset_mappings` is a valid argument. --- .../pipelines/table_question_answering.py | 5 ++- .../pipelines/token_classification.py | 35 ++++++++++++------- tests/test_pipelines_common.py | 7 ++-- ...test_pipelines_table_question_answering.py | 18 +++++++--- tests/test_pipelines_token_classification.py | 13 +++++++ 5 files changed, 58 insertions(+), 20 deletions(-) diff --git a/src/transformers/pipelines/table_question_answering.py b/src/transformers/pipelines/table_question_answering.py index 1ec93d1160..3634fa8c69 100644 --- a/src/transformers/pipelines/table_question_answering.py +++ b/src/transformers/pipelines/table_question_answering.py @@ -1,4 +1,5 @@ import collections +import types import numpy as np @@ -9,7 +10,7 @@ from ..file_utils import ( is_torch_available, requires_backends, ) -from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline, PipelineException +from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Dataset, Pipeline, PipelineException if is_torch_available(): @@ -58,6 +59,8 @@ class TableQuestionAnsweringArgumentHandler(ArgumentHandler): f"If keyword argument `table` is a list of dictionaries, each dictionary should have a `table` " f"and `query` key, but only dictionary has keys {table[0].keys()} `table` and `query` keys." ) + elif Dataset is not None and isinstance(table, Dataset) or isinstance(table, types.GeneratorType): + return table else: raise ValueError( f"Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but " diff --git a/src/transformers/pipelines/token_classification.py b/src/transformers/pipelines/token_classification.py index 6324327dcd..9bfea4c378 100644 --- a/src/transformers/pipelines/token_classification.py +++ b/src/transformers/pipelines/token_classification.py @@ -1,3 +1,4 @@ +import types import warnings from typing import List, Optional, Tuple, Union @@ -5,7 +6,7 @@ import numpy as np from ..file_utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available from ..models.bert.tokenization_bert import BasicTokenizer -from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline +from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Dataset, Pipeline if is_tf_available(): @@ -28,6 +29,8 @@ class TokenClassificationArgumentHandler(ArgumentHandler): elif isinstance(inputs, str): inputs = [inputs] batch_size = 1 + elif Dataset is not None and isinstance(inputs, Dataset) or isinstance(inputs, types.GeneratorType): + return inputs, None else: raise ValueError("At least one input is required.") @@ -112,8 +115,13 @@ class TokenClassificationPipeline(Pipeline): grouped_entities: Optional[bool] = None, ignore_subwords: Optional[bool] = None, aggregation_strategy: Optional[AggregationStrategy] = None, + offset_mapping: Optional[List[Tuple[int, int]]] = None, ): + preprocess_params = {} + if offset_mapping is not None: + preprocess_params["offset_mapping"] = offset_mapping + postprocess_params = {} if grouped_entities is not None or ignore_subwords is not None: if grouped_entities and ignore_subwords: @@ -147,7 +155,7 @@ class TokenClassificationPipeline(Pipeline): postprocess_params["aggregation_strategy"] = aggregation_strategy if ignore_labels is not None: postprocess_params["ignore_labels"] = ignore_labels - return {}, {}, postprocess_params + return preprocess_params, {}, postprocess_params def __call__(self, inputs: Union[str, List[str]], **kwargs): """ @@ -174,12 +182,13 @@ class TokenClassificationPipeline(Pipeline): Only exists if the offsets are available within the tokenizer """ - _inputs, offset_mappings = self._args_parser(inputs, **kwargs) - self.offset_mappings = offset_mappings + _inputs, offset_mapping = self._args_parser(inputs, **kwargs) + if offset_mapping: + kwargs["offset_mapping"] = offset_mapping return super().__call__(inputs, **kwargs) - def preprocess(self, sentence): + def preprocess(self, sentence, offset_mapping=None): truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False model_inputs = self.tokenizer( sentence, @@ -189,8 +198,7 @@ class TokenClassificationPipeline(Pipeline): return_special_tokens_mask=True, return_offsets_mapping=self.tokenizer.is_fast, ) - if self.offset_mappings: - offset_mapping = self.offset_mappings[0] + if offset_mapping: model_inputs["offset_mapping"] = offset_mapping model_inputs["sentence"] = sentence @@ -262,12 +270,13 @@ class TokenClassificationPipeline(Pipeline): word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx])) if offset_mapping is not None: start_ind, end_ind = offset_mapping[idx] - if self.framework == "pt": - start_ind = start_ind.item() - end_ind = end_ind.item() - else: - start_ind = int(start_ind.numpy()) - end_ind = int(end_ind.numpy()) + if not isinstance(start_ind, int): + if self.framework == "pt": + start_ind = start_ind.item() + end_ind = end_ind.item() + else: + start_ind = int(start_ind.numpy()) + end_ind = int(end_ind.numpy()) word_ref = sentence[start_ind:end_ind] if getattr(self.tokenizer._tokenizer.model, "continuing_subword_prefix", None): # This is a BPE, word aware tokenizer, there is a correct way diff --git a/tests/test_pipelines_common.py b/tests/test_pipelines_common.py index 12cee76bf8..62a1831dc4 100644 --- a/tests/test_pipelines_common.py +++ b/tests/test_pipelines_common.py @@ -183,9 +183,12 @@ class PipelineTestCaseMeta(type): # 10 examples with batch size 4 means there needs to be a unfinished batch # which is important for the unbatcher - dataset = [copy.deepcopy(random.choice(examples)) for i in range(10)] + def data(n): + for _ in range(n): + # Need to copy because Conversation object is mutated + yield copy.deepcopy(random.choice(examples)) - for item in pipeline(dataset, batch_size=4): + for item in pipeline(data(10), batch_size=4): pass run_batch_test(pipeline, examples) diff --git a/tests/test_pipelines_table_question_answering.py b/tests/test_pipelines_table_question_answering.py index 789e92c3d7..0793d6586c 100644 --- a/tests/test_pipelines_table_question_answering.py +++ b/tests/test_pipelines_table_question_answering.py @@ -35,17 +35,16 @@ from transformers.testing_utils import ( from .test_pipelines_common import PipelineTestCaseMeta -@require_tensorflow_probability -@require_torch_scatter -@require_torch -@require_pandas @is_pipeline_test class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): # Putting it there for consistency, but TQA do not have fast tokenizer # which are needed to generate automatic tests model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING + @require_tensorflow_probability + @require_pandas @require_tf + @require_torch def test_small_model_tf(self): model_id = "lysandre/tiny-tapas-random-wtq" model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True) @@ -147,6 +146,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): ) @require_torch + @require_torch_scatter def test_small_model_pt(self): model_id = "lysandre/tiny-tapas-random-wtq" model = AutoModelForTableQuestionAnswering.from_pretrained(model_id) @@ -248,6 +248,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): ) @require_torch + @require_torch_scatter def test_slow_tokenizer_sqa_pt(self): model_id = "lysandre/tiny-tapas-random-sqa" model = AutoModelForTableQuestionAnswering.from_pretrained(model_id) @@ -366,6 +367,9 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): ) @require_tf + @require_tensorflow_probability + @require_pandas + @require_torch def test_slow_tokenizer_sqa_tf(self): model_id = "lysandre/tiny-tapas-random-sqa" model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True) @@ -484,6 +488,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): ) @slow + @require_torch_scatter def test_integration_wtq_pt(self): table_querier = pipeline("table-question-answering") @@ -528,6 +533,8 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): self.assertListEqual(results, expected_results) @slow + @require_tensorflow_probability + @require_pandas def test_integration_wtq_tf(self): model_id = "google/tapas-base-finetuned-wtq" model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id) @@ -575,6 +582,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): self.assertListEqual(results, expected_results) @slow + @require_torch_scatter def test_integration_sqa_pt(self): table_querier = pipeline( "table-question-answering", @@ -598,6 +606,8 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): self.assertListEqual(results, expected_results) @slow + @require_tensorflow_probability + @require_pandas def test_integration_sqa_tf(self): model_id = "google/tapas-base-finetuned-sqa" model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id) diff --git a/tests/test_pipelines_token_classification.py b/tests/test_pipelines_token_classification.py index 8c469e14ab..94ac7a19ce 100644 --- a/tests/test_pipelines_token_classification.py +++ b/tests/test_pipelines_token_classification.py @@ -636,6 +636,19 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest [], ) + token_classifier = pipeline(task="token-classification", model=model_name, framework="pt") + # Overload offset_mapping + outputs = token_classifier( + "This is a test !", offset_mapping=[(0, 0), (0, 1), (0, 2), (0, 0), (0, 0), (0, 0), (0, 0)] + ) + self.assertEqual( + nested_simplify(outputs), + [ + {"entity": "I-MISC", "score": 0.115, "index": 1, "word": "this", "start": 0, "end": 1}, + {"entity": "I-MISC", "score": 0.115, "index": 2, "word": "is", "start": 0, "end": 2}, + ], + ) + @require_torch def test_pt_ignore_subwords_slow_tokenizer_raises(self): model_name = "sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english"