Fixing Dataset for TQA + token-classification. (#14658)
* Fixing Dataset for TQA + token-classification.
* Fixing the tests.
* Making sure `offset_mappings` is a valid argument.
This commit is contained in:
@@ -1,4 +1,5 @@
|
|||||||
import collections
|
import collections
|
||||||
|
import types
|
||||||
|
|
||||||
import numpy as np
|
import numpy as np
|
||||||
|
|
||||||
@@ -9,7 +10,7 @@ from ..file_utils import (
|
|||||||
is_torch_available,
|
is_torch_available,
|
||||||
requires_backends,
|
requires_backends,
|
||||||
)
|
)
|
||||||
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline, PipelineException
|
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Dataset, Pipeline, PipelineException
|
||||||
|
|
||||||
|
|
||||||
if is_torch_available():
|
if is_torch_available():
|
||||||
@@ -58,6 +59,8 @@ class TableQuestionAnsweringArgumentHandler(ArgumentHandler):
|
|||||||
f"If keyword argument `table` is a list of dictionaries, each dictionary should have a `table` "
|
f"If keyword argument `table` is a list of dictionaries, each dictionary should have a `table` "
|
||||||
f"and `query` key, but only dictionary has keys {table[0].keys()} `table` and `query` keys."
|
f"and `query` key, but only dictionary has keys {table[0].keys()} `table` and `query` keys."
|
||||||
)
|
)
|
||||||
|
elif Dataset is not None and isinstance(table, Dataset) or isinstance(table, types.GeneratorType):
|
||||||
|
return table
|
||||||
else:
|
else:
|
||||||
raise ValueError(
|
raise ValueError(
|
||||||
f"Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but "
|
f"Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but "
|
||||||
|
|||||||
@@ -1,3 +1,4 @@
|
|||||||
|
import types
|
||||||
import warnings
|
import warnings
|
||||||
from typing import List, Optional, Tuple, Union
|
from typing import List, Optional, Tuple, Union
|
||||||
|
|
||||||
@@ -5,7 +6,7 @@ import numpy as np
|
|||||||
|
|
||||||
from ..file_utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
|
from ..file_utils import ExplicitEnum, add_end_docstrings, is_tf_available, is_torch_available
|
||||||
from ..models.bert.tokenization_bert import BasicTokenizer
|
from ..models.bert.tokenization_bert import BasicTokenizer
|
||||||
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline
|
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Dataset, Pipeline
|
||||||
|
|
||||||
|
|
||||||
if is_tf_available():
|
if is_tf_available():
|
||||||
@@ -28,6 +29,8 @@ class TokenClassificationArgumentHandler(ArgumentHandler):
|
|||||||
elif isinstance(inputs, str):
|
elif isinstance(inputs, str):
|
||||||
inputs = [inputs]
|
inputs = [inputs]
|
||||||
batch_size = 1
|
batch_size = 1
|
||||||
|
elif Dataset is not None and isinstance(inputs, Dataset) or isinstance(inputs, types.GeneratorType):
|
||||||
|
return inputs, None
|
||||||
else:
|
else:
|
||||||
raise ValueError("At least one input is required.")
|
raise ValueError("At least one input is required.")
|
||||||
|
|
||||||
@@ -112,8 +115,13 @@ class TokenClassificationPipeline(Pipeline):
|
|||||||
grouped_entities: Optional[bool] = None,
|
grouped_entities: Optional[bool] = None,
|
||||||
ignore_subwords: Optional[bool] = None,
|
ignore_subwords: Optional[bool] = None,
|
||||||
aggregation_strategy: Optional[AggregationStrategy] = None,
|
aggregation_strategy: Optional[AggregationStrategy] = None,
|
||||||
|
offset_mapping: Optional[List[Tuple[int, int]]] = None,
|
||||||
):
|
):
|
||||||
|
|
||||||
|
preprocess_params = {}
|
||||||
|
if offset_mapping is not None:
|
||||||
|
preprocess_params["offset_mapping"] = offset_mapping
|
||||||
|
|
||||||
postprocess_params = {}
|
postprocess_params = {}
|
||||||
if grouped_entities is not None or ignore_subwords is not None:
|
if grouped_entities is not None or ignore_subwords is not None:
|
||||||
if grouped_entities and ignore_subwords:
|
if grouped_entities and ignore_subwords:
|
||||||
@@ -147,7 +155,7 @@ class TokenClassificationPipeline(Pipeline):
|
|||||||
postprocess_params["aggregation_strategy"] = aggregation_strategy
|
postprocess_params["aggregation_strategy"] = aggregation_strategy
|
||||||
if ignore_labels is not None:
|
if ignore_labels is not None:
|
||||||
postprocess_params["ignore_labels"] = ignore_labels
|
postprocess_params["ignore_labels"] = ignore_labels
|
||||||
return {}, {}, postprocess_params
|
return preprocess_params, {}, postprocess_params
|
||||||
|
|
||||||
def __call__(self, inputs: Union[str, List[str]], **kwargs):
|
def __call__(self, inputs: Union[str, List[str]], **kwargs):
|
||||||
"""
|
"""
|
||||||
@@ -174,12 +182,13 @@ class TokenClassificationPipeline(Pipeline):
|
|||||||
Only exists if the offsets are available within the tokenizer
|
Only exists if the offsets are available within the tokenizer
|
||||||
"""
|
"""
|
||||||
|
|
||||||
_inputs, offset_mappings = self._args_parser(inputs, **kwargs)
|
_inputs, offset_mapping = self._args_parser(inputs, **kwargs)
|
||||||
self.offset_mappings = offset_mappings
|
if offset_mapping:
|
||||||
|
kwargs["offset_mapping"] = offset_mapping
|
||||||
|
|
||||||
return super().__call__(inputs, **kwargs)
|
return super().__call__(inputs, **kwargs)
|
||||||
|
|
||||||
def preprocess(self, sentence):
|
def preprocess(self, sentence, offset_mapping=None):
|
||||||
truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
|
truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
|
||||||
model_inputs = self.tokenizer(
|
model_inputs = self.tokenizer(
|
||||||
sentence,
|
sentence,
|
||||||
@@ -189,8 +198,7 @@ class TokenClassificationPipeline(Pipeline):
|
|||||||
return_special_tokens_mask=True,
|
return_special_tokens_mask=True,
|
||||||
return_offsets_mapping=self.tokenizer.is_fast,
|
return_offsets_mapping=self.tokenizer.is_fast,
|
||||||
)
|
)
|
||||||
if self.offset_mappings:
|
if offset_mapping:
|
||||||
offset_mapping = self.offset_mappings[0]
|
|
||||||
model_inputs["offset_mapping"] = offset_mapping
|
model_inputs["offset_mapping"] = offset_mapping
|
||||||
|
|
||||||
model_inputs["sentence"] = sentence
|
model_inputs["sentence"] = sentence
|
||||||
@@ -262,6 +270,7 @@ class TokenClassificationPipeline(Pipeline):
|
|||||||
word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))
|
word = self.tokenizer.convert_ids_to_tokens(int(input_ids[idx]))
|
||||||
if offset_mapping is not None:
|
if offset_mapping is not None:
|
||||||
start_ind, end_ind = offset_mapping[idx]
|
start_ind, end_ind = offset_mapping[idx]
|
||||||
|
if not isinstance(start_ind, int):
|
||||||
if self.framework == "pt":
|
if self.framework == "pt":
|
||||||
start_ind = start_ind.item()
|
start_ind = start_ind.item()
|
||||||
end_ind = end_ind.item()
|
end_ind = end_ind.item()
|
||||||
|
|||||||
@@ -183,9 +183,12 @@ class PipelineTestCaseMeta(type):
|
|||||||
|
|
||||||
# 10 examples with batch size 4 means there needs to be an unfinished batch
|
# 10 examples with batch size 4 means there needs to be an unfinished batch
|
||||||
# which is important for the unbatcher
|
# which is important for the unbatcher
|
||||||
dataset = [copy.deepcopy(random.choice(examples)) for i in range(10)]
|
def data(n):
|
||||||
|
for _ in range(n):
|
||||||
|
# Need to copy because Conversation object is mutated
|
||||||
|
yield copy.deepcopy(random.choice(examples))
|
||||||
|
|
||||||
for item in pipeline(dataset, batch_size=4):
|
for item in pipeline(data(10), batch_size=4):
|
||||||
pass
|
pass
|
||||||
|
|
||||||
run_batch_test(pipeline, examples)
|
run_batch_test(pipeline, examples)
|
||||||
|
|||||||
@@ -35,17 +35,16 @@ from transformers.testing_utils import (
|
|||||||
from .test_pipelines_common import PipelineTestCaseMeta
|
from .test_pipelines_common import PipelineTestCaseMeta
|
||||||
|
|
||||||
|
|
||||||
@require_tensorflow_probability
|
|
||||||
@require_torch_scatter
|
|
||||||
@require_torch
|
|
||||||
@require_pandas
|
|
||||||
@is_pipeline_test
|
@is_pipeline_test
|
||||||
class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
||||||
# Putting it there for consistency, but TQA does not have a fast tokenizer,
|
# Putting it there for consistency, but TQA does not have a fast tokenizer,
|
||||||
# which is needed to generate automatic tests
|
# which is needed to generate automatic tests
|
||||||
model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
|
model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
|
||||||
|
|
||||||
|
@require_tensorflow_probability
|
||||||
|
@require_pandas
|
||||||
@require_tf
|
@require_tf
|
||||||
|
@require_torch
|
||||||
def test_small_model_tf(self):
|
def test_small_model_tf(self):
|
||||||
model_id = "lysandre/tiny-tapas-random-wtq"
|
model_id = "lysandre/tiny-tapas-random-wtq"
|
||||||
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
|
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
|
||||||
@@ -147,6 +146,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
|||||||
)
|
)
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
|
@require_torch_scatter
|
||||||
def test_small_model_pt(self):
|
def test_small_model_pt(self):
|
||||||
model_id = "lysandre/tiny-tapas-random-wtq"
|
model_id = "lysandre/tiny-tapas-random-wtq"
|
||||||
model = AutoModelForTableQuestionAnswering.from_pretrained(model_id)
|
model = AutoModelForTableQuestionAnswering.from_pretrained(model_id)
|
||||||
@@ -248,6 +248,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
|||||||
)
|
)
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
|
@require_torch_scatter
|
||||||
def test_slow_tokenizer_sqa_pt(self):
|
def test_slow_tokenizer_sqa_pt(self):
|
||||||
model_id = "lysandre/tiny-tapas-random-sqa"
|
model_id = "lysandre/tiny-tapas-random-sqa"
|
||||||
model = AutoModelForTableQuestionAnswering.from_pretrained(model_id)
|
model = AutoModelForTableQuestionAnswering.from_pretrained(model_id)
|
||||||
@@ -366,6 +367,9 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
|||||||
)
|
)
|
||||||
|
|
||||||
@require_tf
|
@require_tf
|
||||||
|
@require_tensorflow_probability
|
||||||
|
@require_pandas
|
||||||
|
@require_torch
|
||||||
def test_slow_tokenizer_sqa_tf(self):
|
def test_slow_tokenizer_sqa_tf(self):
|
||||||
model_id = "lysandre/tiny-tapas-random-sqa"
|
model_id = "lysandre/tiny-tapas-random-sqa"
|
||||||
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
|
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
|
||||||
@@ -484,6 +488,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
|||||||
)
|
)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
|
@require_torch_scatter
|
||||||
def test_integration_wtq_pt(self):
|
def test_integration_wtq_pt(self):
|
||||||
table_querier = pipeline("table-question-answering")
|
table_querier = pipeline("table-question-answering")
|
||||||
|
|
||||||
@@ -528,6 +533,8 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
|||||||
self.assertListEqual(results, expected_results)
|
self.assertListEqual(results, expected_results)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
|
@require_tensorflow_probability
|
||||||
|
@require_pandas
|
||||||
def test_integration_wtq_tf(self):
|
def test_integration_wtq_tf(self):
|
||||||
model_id = "google/tapas-base-finetuned-wtq"
|
model_id = "google/tapas-base-finetuned-wtq"
|
||||||
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id)
|
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id)
|
||||||
@@ -575,6 +582,7 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
|||||||
self.assertListEqual(results, expected_results)
|
self.assertListEqual(results, expected_results)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
|
@require_torch_scatter
|
||||||
def test_integration_sqa_pt(self):
|
def test_integration_sqa_pt(self):
|
||||||
table_querier = pipeline(
|
table_querier = pipeline(
|
||||||
"table-question-answering",
|
"table-question-answering",
|
||||||
@@ -598,6 +606,8 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
|
|||||||
self.assertListEqual(results, expected_results)
|
self.assertListEqual(results, expected_results)
|
||||||
|
|
||||||
@slow
|
@slow
|
||||||
|
@require_tensorflow_probability
|
||||||
|
@require_pandas
|
||||||
def test_integration_sqa_tf(self):
|
def test_integration_sqa_tf(self):
|
||||||
model_id = "google/tapas-base-finetuned-sqa"
|
model_id = "google/tapas-base-finetuned-sqa"
|
||||||
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id)
|
model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id)
|
||||||
|
|||||||
@@ -636,6 +636,19 @@ class TokenClassificationPipelineTests(unittest.TestCase, metaclass=PipelineTest
|
|||||||
[],
|
[],
|
||||||
)
|
)
|
||||||
|
|
||||||
|
token_classifier = pipeline(task="token-classification", model=model_name, framework="pt")
|
||||||
|
# Overload offset_mapping
|
||||||
|
outputs = token_classifier(
|
||||||
|
"This is a test !", offset_mapping=[(0, 0), (0, 1), (0, 2), (0, 0), (0, 0), (0, 0), (0, 0)]
|
||||||
|
)
|
||||||
|
self.assertEqual(
|
||||||
|
nested_simplify(outputs),
|
||||||
|
[
|
||||||
|
{"entity": "I-MISC", "score": 0.115, "index": 1, "word": "this", "start": 0, "end": 1},
|
||||||
|
{"entity": "I-MISC", "score": 0.115, "index": 2, "word": "is", "start": 0, "end": 2},
|
||||||
|
],
|
||||||
|
)
|
||||||
|
|
||||||
@require_torch
|
@require_torch
|
||||||
def test_pt_ignore_subwords_slow_tokenizer_raises(self):
|
def test_pt_ignore_subwords_slow_tokenizer_raises(self):
|
||||||
model_name = "sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english"
|
model_name = "sshleifer/tiny-dbmdz-bert-large-cased-finetuned-conll03-english"
|
||||||
|
|||||||
Reference in New Issue
Block a user