From 1c1a2ffbff2052100053cddb3a87d45fb9d210ca Mon Sep 17 00:00:00 2001 From: Lysandre Debut Date: Wed, 16 Dec 2020 12:31:50 -0500 Subject: [PATCH] TableQuestionAnsweringPipeline (#9145) * AutoModelForTableQuestionAnswering * TableQuestionAnsweringPipeline * Apply suggestions from Patrick's code review Co-authored-by: Patrick von Platen * Sylvain and Patrick comments * Better PyTorch/TF error message * Add integration tests * Argument Handler naming Co-authored-by: patrickvonplaten * Fix docs to appease the documentation gods Co-authored-by: Patrick von Platen --- docs/source/main_classes/pipelines.rst | 8 + src/transformers/__init__.py | 1 + src/transformers/file_utils.py | 13 + .../models/tapas/configuration_tapas.py | 49 ++- .../models/tapas/tokenization_tapas.py | 18 +- src/transformers/pipelines.py | 293 +++++++++++++++++- src/transformers/testing_utils.py | 13 + ...test_pipelines_table_question_answering.py | 234 ++++++++++++++ 8 files changed, 602 insertions(+), 27 deletions(-) create mode 100644 tests/test_pipelines_table_question_answering.py diff --git a/docs/source/main_classes/pipelines.rst b/docs/source/main_classes/pipelines.rst index 1e4bb4a43c..04ec19c9a5 100644 --- a/docs/source/main_classes/pipelines.rst +++ b/docs/source/main_classes/pipelines.rst @@ -34,6 +34,7 @@ There are two categories of pipeline abstractions to be aware about: - :class:`~transformers.TranslationPipeline` - :class:`~transformers.ZeroShotClassificationPipeline` - :class:`~transformers.Text2TextGenerationPipeline` + - :class:`~transformers.TableQuestionAnsweringPipeline` The pipeline abstraction ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -91,6 +92,13 @@ SummarizationPipeline :special-members: __call__ :members: +TableQuestionAnsweringPipeline +======================================================================================================================= + +.. 
autoclass:: transformers.TableQuestionAnsweringPipeline + :special-members: __call__ + + TextClassificationPipeline ======================================================================================================================= diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 7ed45b55db..eadf725f40 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -190,6 +190,7 @@ from .pipelines import ( PipelineDataFormat, QuestionAnsweringPipeline, SummarizationPipeline, + TableQuestionAnsweringPipeline, Text2TextGenerationPipeline, TextClassificationPipeline, TextGenerationPipeline, diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 27e28cbfa3..85dca20e20 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -468,6 +468,13 @@ explained here: https://github.com/rusty1s/pytorch_scatter. """ +# docstyle-ignore +PANDAS_IMPORT_ERROR = """ +{0} requires the pandas library but it was not found in your environment. You can install it with pip as +explained here: https://pandas.pydata.org/pandas-docs/stable/getting_started/install.html. 
+""" + + def requires_datasets(obj): name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ if not is_datasets_available(): @@ -522,6 +529,12 @@ def requires_protobuf(obj): raise ImportError(PROTOBUF_IMPORT_ERROR.format(name)) +def requires_pandas(obj): + name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ + if not is_pandas_available(): + raise ImportError(PANDAS_IMPORT_ERROR.format(name)) + + def requires_scatter(obj): name = obj.__name__ if hasattr(obj, "__name__") else obj.__class__.__name__ if not is_scatter_available(): diff --git a/src/transformers/models/tapas/configuration_tapas.py b/src/transformers/models/tapas/configuration_tapas.py index 6d67fcbe23..834cae0c7e 100644 --- a/src/transformers/models/tapas/configuration_tapas.py +++ b/src/transformers/models/tapas/configuration_tapas.py @@ -87,41 +87,49 @@ class TapasConfig(PretrainedConfig): Importance weight for the regression loss. use_normalized_answer_loss (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to normalize the answer loss by the maximum of the predicted and expected value. - huber_loss_delta: (:obj:`float`, `optional`): + huber_loss_delta (:obj:`float`, `optional`): Delta parameter used to calculate the regression loss. - temperature: (:obj:`float`, `optional`, defaults to 1.0): + temperature (:obj:`float`, `optional`, defaults to 1.0): Value used to control (OR change) the skewness of cell logits probabilities. - aggregation_temperature: (:obj:`float`, `optional`, defaults to 1.0): + aggregation_temperature (:obj:`float`, `optional`, defaults to 1.0): Scales aggregation logits to control the skewness of probabilities. - use_gumbel_for_cells: (:obj:`bool`, `optional`, defaults to :obj:`False`): + use_gumbel_for_cells (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to apply Gumbel-Softmax to cell selection. 
- use_gumbel_for_aggregation: (:obj:`bool`, `optional`, defaults to :obj:`False`): + use_gumbel_for_aggregation (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to apply Gumbel-Softmax to aggregation selection. - average_approximation_function: (:obj:`string`, `optional`, defaults to :obj:`"ratio"`): + average_approximation_function (:obj:`string`, `optional`, defaults to :obj:`"ratio"`): Method to calculate the expected average of cells in the weak supervision case. One of :obj:`"ratio"`, :obj:`"first_order"` or :obj:`"second_order"`. - cell_selection_preference: (:obj:`float`, `optional`): + cell_selection_preference (:obj:`float`, `optional`): Preference for cell selection in ambiguous cases. Only applicable in case of weak supervision for aggregation (WTQ, WikiSQL). If the total mass of the aggregation probabilities (excluding the "NONE" operator) is higher than this hyperparameter, then aggregation is predicted for an example. - answer_loss_cutoff: (:obj:`float`, `optional`): + answer_loss_cutoff (:obj:`float`, `optional`): Ignore examples with answer loss larger than cutoff. - max_num_rows: (:obj:`int`, `optional`, defaults to 64): + max_num_rows (:obj:`int`, `optional`, defaults to 64): Maximum number of rows. - max_num_columns: (:obj:`int`, `optional`, defaults to 32): + max_num_columns (:obj:`int`, `optional`, defaults to 32): Maximum number of columns. - average_logits_per_cell: (:obj:`bool`, `optional`, defaults to :obj:`False`): + average_logits_per_cell (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to average logits per cell. - select_one_column: (:obj:`bool`, `optional`, defaults to :obj:`True`): + select_one_column (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to constrain the model to only select cells from a single column. 
- allow_empty_column_selection: (:obj:`bool`, `optional`, defaults to :obj:`False`): + allow_empty_column_selection (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to allow not to select any column. - init_cell_selection_weights_to_zero: (:obj:`bool`, `optional`, defaults to :obj:`False`): + init_cell_selection_weights_to_zero (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to initialize cell selection weights to 0 so that the initial probabilities are 50%. - reset_position_index_per_cell: (:obj:`bool`, `optional`, defaults to :obj:`True`): + reset_position_index_per_cell (:obj:`bool`, `optional`, defaults to :obj:`True`): Whether to restart position indexes at every cell (i.e. use relative position embeddings). - disable_per_token_loss: (:obj:`bool`, `optional`, defaults to :obj:`False`): + disable_per_token_loss (:obj:`bool`, `optional`, defaults to :obj:`False`): Whether to disable any (strong or weak) supervision on cells. + aggregation_labels (:obj:`Dict[int, label]`, `optional`): + The aggregation labels used to aggregate the results. For example, the WTQ models have the following + aggregation labels: :obj:`{0: "NONE", 1: "SUM", 2: "AVERAGE", 3: "COUNT"}` + no_aggregation_label_index (:obj:`int`, `optional`): + If the aggregation labels are defined and one of these labels represents "No aggregation", this should be + set to its index. For example, the WTQ models have the "NONE" aggregation label at index 0, so that value + should be set to 0 for these models. 
+ Example:: @@ -174,6 +182,8 @@ class TapasConfig(PretrainedConfig): init_cell_selection_weights_to_zero=False, reset_position_index_per_cell=True, disable_per_token_loss=False, + aggregation_labels=None, + no_aggregation_label_index=None, **kwargs ): @@ -217,3 +227,10 @@ class TapasConfig(PretrainedConfig): self.init_cell_selection_weights_to_zero = init_cell_selection_weights_to_zero self.reset_position_index_per_cell = reset_position_index_per_cell self.disable_per_token_loss = disable_per_token_loss + + # Aggregation hyperparameters + self.aggregation_labels = aggregation_labels + self.no_aggregation_label_index = no_aggregation_label_index + + if isinstance(self.aggregation_labels, dict): + self.aggregation_labels = {int(k): v for k, v in aggregation_labels.items()} diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index 6b83c289d5..b90f9e47e2 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -1905,12 +1905,14 @@ class TapasTokenizer(PreTrainedTokenizer): this threshold will be selected. Returns: - :obj:`tuple` comprising various elements depending on the inputs: predicted_answer_coordinates - (``List[List[[tuple]]`` of length ``batch_size``): Predicted answer coordinates as a list of lists of - tuples. Each element in the list contains the predicted answer coordinates of a single example in the - batch, as a list of tuples. Each tuple is a cell, i.e. (row index, column index). - predicted_aggregation_indices (`optional`, returned when ``logits_aggregation`` is provided) ``List[int]`` - of length ``batch_size``: Predicted aggregation operator indices of the aggregation head. + :obj:`tuple` comprising various elements depending on the inputs: + + - predicted_answer_coordinates (``List[List[tuple]]`` of length ``batch_size``): Predicted answer + coordinates as a list of lists of tuples.
Each element in the list contains the predicted answer + coordinates of a single example in the batch, as a list of tuples. Each tuple is a cell, i.e. (row index, + column index). + - predicted_aggregation_indices (``List[int]`` of length ``batch_size``, `optional`, returned when + ``logits_aggregation`` is provided): Predicted aggregation operator indices of the aggregation head. """ # input data is of type float32 # np.log(np.finfo(np.float32).max) = 88.72284 @@ -1969,11 +1971,11 @@ class TapasTokenizer(PreTrainedTokenizer): answer_coordinates = sorted(answer_coordinates) predicted_answer_coordinates.append(answer_coordinates) - output = predicted_answer_coordinates + output = (predicted_answer_coordinates,) if logits_agg is not None: predicted_aggregation_indices = logits_agg.argmax(dim=-1) - output = (output, predicted_aggregation_indices.tolist()) + output = (predicted_answer_coordinates, predicted_aggregation_indices.tolist()) return output diff --git a/src/transformers/pipelines.py b/src/transformers/pipelines.py index 224f5f3ac0..5d0376c751 100755 --- a/src/transformers/pipelines.py +++ b/src/transformers/pipelines.py @@ -12,8 +12,7 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License.
- - +import collections import csv import json import os @@ -32,7 +31,7 @@ import numpy as np from .configuration_utils import PretrainedConfig from .data import SquadExample, SquadFeatures, squad_convert_examples_to_features -from .file_utils import add_end_docstrings, is_tf_available, is_torch_available +from .file_utils import add_end_docstrings, is_tf_available, is_torch_available, requires_pandas from .modelcard import ModelCard from .models.auto.configuration_auto import AutoConfig from .models.auto.tokenization_auto import AutoTokenizer @@ -68,6 +67,7 @@ if is_torch_available(): MODEL_FOR_QUESTION_ANSWERING_MAPPING, MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, AutoModel, AutoModelForCausalLM, @@ -75,6 +75,7 @@ if is_torch_available(): AutoModelForQuestionAnswering, AutoModelForSeq2SeqLM, AutoModelForSequenceClassification, + AutoModelForTableQuestionAnswering, AutoModelForTokenClassification, ) @@ -2058,6 +2059,274 @@ class QuestionAnsweringPipeline(Pipeline): } +class TableQuestionAnsweringArgumentHandler(ArgumentHandler): + """ + Handles arguments for the TableQuestionAnsweringPipeline + """ + + def __call__(self, table=None, query=None, sequential=False, padding=True, truncation=True): + # Returns tqa_pipeline_inputs of shape: + # [ + # {"table": pd.DataFrame, "query": List[str]}, + # ..., + # {"table": pd.DataFrame, "query" : List[str]} + # ] + requires_pandas(self) + import pandas as pd + + if table is None: + raise ValueError("Keyword argument `table` cannot be None.") + elif query is None: + if isinstance(table, dict) and table.get("query") is not None and table.get("table") is not None: + tqa_pipeline_inputs = [table] + elif isinstance(table, list) and len(table) > 0: + if not all(isinstance(d, dict) for d in table): + raise ValueError( + f"Keyword argument `table` should be a list of dict, but is {(type(d) for d in table)}" + ) 
+ + if table[0].get("query") is not None and table[0].get("table") is not None: + tqa_pipeline_inputs = table + else: + raise ValueError( + f"If keyword argument `table` is a list of dictionaries, each dictionary should have a `table` " + f"and `query` key, but only dictionary has keys {table[0].keys()} `table` and `query` keys." + ) + else: + raise ValueError( + f"Invalid input. Keyword argument `table` should be either of type `dict` or `list`, but " + f"is {type(table)})" + ) + else: + tqa_pipeline_inputs = [{"table": table, "query": query}] + + for tqa_pipeline_input in tqa_pipeline_inputs: + if not isinstance(tqa_pipeline_input["table"], pd.DataFrame): + if tqa_pipeline_input["table"] is None: + raise ValueError("Table cannot be None.") + + tqa_pipeline_input["table"] = pd.DataFrame(tqa_pipeline_input["table"]) + + return tqa_pipeline_inputs, sequential, padding, truncation + + +@add_end_docstrings(PIPELINE_INIT_ARGS) +class TableQuestionAnsweringPipeline(Pipeline): + """ + Table Question Answering pipeline using a :obj:`ModelForTableQuestionAnswering`. This pipeline is only available in + PyTorch. + + This tabular question answering pipeline can currently be loaded from :func:`~transformers.pipeline` using the + following task identifier: :obj:`"table-question-answering"`. + + The models that this pipeline can use are models that have been fine-tuned on a tabular question answering task. + See the up-to-date list of available models on `huggingface.co/models + <https://huggingface.co/models?filter=table-question-answering>`__.
+ """ + + default_input_names = "table,query" + + def __init__(self, args_parser=TableQuestionAnsweringArgumentHandler(), *args, **kwargs): + super().__init__(*args, **kwargs) + self._args_parser = args_parser + + if self.framework == "tf": + raise ValueError("The TableQuestionAnsweringPipeline is only available in PyTorch.") + + self.check_model_type(MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING) + + self.aggregate = bool(getattr(self.model.config, "aggregation_labels")) and bool( + getattr(self.model.config, "num_aggregation_labels") + ) + + def batch_inference(self, **inputs): + with torch.no_grad(): + return self.model(**inputs) + + def sequential_inference(self, **inputs): + """ + Inference used for models that need to process sequences in a sequential fashion, like the SQA models which + handle conversational query related to a table. + """ + with torch.no_grad(): + all_logits = [] + all_aggregations = [] + prev_answers = None + batch_size = inputs["input_ids"].shape[0] + + input_ids = inputs["input_ids"].to(self.device) + attention_mask = inputs["attention_mask"].to(self.device) + token_type_ids = inputs["token_type_ids"].to(self.device) + token_type_ids_example = None + + for index in range(batch_size): + # If sequences have already been processed, the token type IDs will be created according to the previous + # answer. 
+ if prev_answers is not None: + prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,) + model_labels = np.zeros_like(prev_labels_example.cpu().numpy()) # shape (seq_len,) + + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + for i in range(model_labels.shape[0]): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col_id = token_type_ids_example[:, 1].tolist()[i] - 1 + row_id = token_type_ids_example[:, 2].tolist()[i] - 1 + + if row_id >= 0 and col_id >= 0 and segment_id == 1: + model_labels[i] = int(prev_answers[(col_id, row_id)]) + + token_type_ids_example[:, 3] = torch.from_numpy(model_labels).type(torch.long).to(self.device) + + input_ids_example = input_ids[index] + attention_mask_example = attention_mask[index] # shape (seq_len,) + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + outputs = self.model( + input_ids=input_ids_example.unsqueeze(0), + attention_mask=attention_mask_example.unsqueeze(0), + token_type_ids=token_type_ids_example.unsqueeze(0), + ) + logits = outputs.logits + + if self.aggregate: + all_aggregations.append(outputs.logits_aggregation) + + all_logits.append(logits) + + dist_per_token = torch.distributions.Bernoulli(logits=logits) + probabilities = dist_per_token.probs * attention_mask_example.type(torch.float32).to( + dist_per_token.probs.device + ) + + coords_to_probs = collections.defaultdict(list) + for i, p in enumerate(probabilities.squeeze().tolist()): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col = token_type_ids_example[:, 1].tolist()[i] - 1 + row = token_type_ids_example[:, 2].tolist()[i] - 1 + if col >= 0 and row >= 0 and segment_id == 1: + coords_to_probs[(col, row)].append(p) + + prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs} + + logits_batch = torch.cat(tuple(all_logits), 0) + + return (logits_batch,) if not self.aggregate else (logits_batch, torch.cat(tuple(all_aggregations), 0)) + + def __call__(self, 
*args, **kwargs): + r""" + Answers queries according to a table. The pipeline accepts several types of inputs which are detailed below: + + - ``pipeline(table, query)`` + - ``pipeline(table, [query])`` + - ``pipeline(table=table, query=query)`` + - ``pipeline(table=table, query=[query])`` + - ``pipeline({"table": table, "query": query})`` + - ``pipeline({"table": table, "query": [query]})`` + - ``pipeline([{"table": table, "query": query}, {"table": table, "query": query}])`` + + The :obj:`table` argument should be a dict or a DataFrame built from that dict, containing the whole table: + + Example:: + + data = { + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": ["56", "45", "59"], + "number of movies": ["87", "53", "69"], + "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + } + + This dictionary can be passed in as such, or can be converted to a pandas DataFrame: + + Example:: + + import pandas as pd + table = pd.DataFrame.from_dict(data) + + + Args: + table (:obj:`pd.DataFrame` or :obj:`Dict`): + Pandas DataFrame or dictionary that will be converted to a DataFrame containing all the table values. + See above for an example of dictionary. + query (:obj:`str` or :obj:`List[str]`): + Query or list of queries that will be sent to the model alongside the table. + sequential (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether to do inference sequentially or as a batch. Batching is faster, but models like SQA require the + inference to be done sequentially to extract relations within sequences, given their conversational + nature. + padding (:obj:`bool`, :obj:`str` or :class:`~transformers.tokenization_utils_base.PaddingStrategy`, `optional`, defaults to :obj:`True`): + Activates and controls padding. Accepts the following values: + + * :obj:`True` or :obj:`'longest'` (default): Pad to the longest sequence in the batch (or no padding if + only a single sequence is provided).
+ * :obj:`'max_length'`: Pad to a maximum length specified with the argument :obj:`max_length` or to the + maximum acceptable input length for the model if that argument is not provided. + * :obj:`False` or :obj:`'do_not_pad'`: No padding (i.e., can output a batch with sequences of + different lengths). + + truncation (:obj:`bool`, :obj:`str` or :class:`~transformers.TapasTruncationStrategy`, `optional`, defaults to :obj:`True`): + Activates and controls truncation. Accepts the following values: + + * :obj:`True` or :obj:`'drop_rows_to_fit'` (default): Truncate to a maximum length specified with the + argument :obj:`max_length` or to the maximum acceptable input length for the model if that argument + is not provided. This will truncate row by row, removing rows from the table. + * :obj:`False` or :obj:`'do_not_truncate'`: No truncation (i.e., can output a batch with + sequence lengths greater than the model maximum admissible input size). + + + Return: + A dictionary or a list of dictionaries containing results: Each result is a dictionary with the following + keys: + + - **answer** (:obj:`str`) -- The answer of the query given the table. If there is an aggregator, the answer + will be preceded by :obj:`AGGREGATOR >`. + - **coordinates** (:obj:`List[Tuple[int, int]]`) -- Coordinates of the cells of the answers. + - **cells** (:obj:`List[str]`) -- List of strings made up of the answer cell values. + - **aggregator** (:obj:`str`) -- If the model has an aggregator, this returns the aggregator.
+ """ + pipeline_inputs, sequential, padding, truncation = self._args_parser(*args, **kwargs) + batched_answers = [] + for pipeline_input in pipeline_inputs: + table, query = pipeline_input["table"], pipeline_input["query"] + inputs = self.tokenizer( + table, query, return_tensors=self.framework, truncation="drop_rows_to_fit", padding=padding + ) + + outputs = self.sequential_inference(**inputs) if sequential else self.batch_inference(**inputs) + + if self.aggregate: + logits, logits_agg = outputs[:2] + predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits.detach(), logits_agg) + answer_coordinates_batch, agg_predictions = predictions + aggregators = {i: self.model.config.aggregation_labels[pred] for i, pred in enumerate(agg_predictions)} + + no_agg_label_index = self.model.config.no_aggregation_label_index + aggregators_prefix = { + i: aggregators[i] + " > " for i, pred in enumerate(agg_predictions) if pred != no_agg_label_index + } + else: + logits = outputs[0] + predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits.detach()) + answer_coordinates_batch = predictions[0] + aggregators = {} + aggregators_prefix = {} + + answers = [] + for index, coordinates in enumerate(answer_coordinates_batch): + cells = [table.iat[coordinate] for coordinate in coordinates] + aggregator = aggregators.get(index, "") + aggregator_prefix = aggregators_prefix.get(index, "") + answer = { + "answer": aggregator_prefix + ", ".join(cells), + "coordinates": coordinates, + "cells": [table.iat[coordinate] for coordinate in coordinates], + } + if aggregator: + answer["aggregator"] = aggregator + + answers.append(answer) + batched_answers.append(answers if len(answers) > 1 else answers[0]) + return batched_answers if len(batched_answers) > 1 else batched_answers[0] + + @add_end_docstrings(PIPELINE_INIT_ARGS) class SummarizationPipeline(Pipeline): """ @@ -2752,6 +3021,18 @@ SUPPORTED_TASKS = { "model": {"pt": "distilbert-base-cased-distilled-squad", "tf": 
"distilbert-base-cased-distilled-squad"}, }, }, + "table-question-answering": { + "impl": TableQuestionAnsweringPipeline, + "pt": AutoModelForTableQuestionAnswering if is_torch_available() else None, + "tf": None, + "default": { + "model": { + "pt": "nielsr/tapas-base-finetuned-wtq", + "tokenizer": "nielsr/tapas-base-finetuned-wtq", + "tf": "nielsr/tapas-base-finetuned-wtq", + }, + }, + }, "fill-mask": { "impl": FillMaskPipeline, "tf": TFAutoModelForMaskedLM if is_tf_available() else None, @@ -3006,6 +3287,12 @@ def pipeline( "Model might be a PyTorch model (ending with `.bin`) but PyTorch is not available. " "Trying to load the model with Tensorflow." ) + + if model_class is None: + raise ValueError( + f"Pipeline using {framework} framework, but this framework is not supported by this pipeline." + ) + model = model_class.from_pretrained(model, config=config, revision=revision, **model_kwargs) if task == "translation" and model.config.task_specific_params: for key in model.config.task_specific_params: diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index afabe59b40..defb68b397 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -172,6 +172,19 @@ def require_torch(test_case): return test_case +def require_torch_scatter(test_case): + """ + Decorator marking a test that requires PyTorch scatter. + + These tests are skipped when PyTorch scatter isn't installed. + + """ + if not _scatter_available: + return unittest.skip("test requires PyTorch scatter")(test_case) + else: + return test_case + + def require_tf(test_case): """ Decorator marking a test that requires TensorFlow. diff --git a/tests/test_pipelines_table_question_answering.py b/tests/test_pipelines_table_question_answering.py new file mode 100644 index 0000000000..58dedb9575 --- /dev/null +++ b/tests/test_pipelines_table_question_answering.py @@ -0,0 +1,234 @@ +# Copyright 2020 The HuggingFace Team. All rights reserved. 
+# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import unittest + +from transformers.pipelines import Pipeline, pipeline +from transformers.testing_utils import require_pandas, require_torch, require_torch_scatter, slow + +from .test_pipelines_common import CustomInputPipelineCommonMixin + + +@require_torch_scatter +@require_torch +@require_pandas +class TQAPipelineTests(CustomInputPipelineCommonMixin, unittest.TestCase): + pipeline_task = "table-question-answering" + pipeline_running_kwargs = { + "padding": "max_length", + } + small_models = [ + "lysandre/tiny-tapas-random-wtq", + "lysandre/tiny-tapas-random-sqa", + ] + large_models = ["nielsr/tapas-base-finetuned-wtq"] # Models tested with the @slow decorator + valid_inputs = [ + { + "table": { + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": ["56", "45", "59"], + "number of movies": ["87", "53", "69"], + "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + }, + "query": "how many movies has george clooney played in?", + }, + { + "table": { + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": ["56", "45", "59"], + "number of movies": ["87", "53", "69"], + "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + }, + "query": ["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"], + }, + { + "table": { + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", 
"4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + }, + "query": [ + "What repository has the largest number of stars?", + "Given that the numbers of stars defines if a repository is active, what repository is the most active?", + "What is the number of repositories?", + "What is the average number of stars?", + "What is the total amount of stars?", + ], + }, + ] + + def _test_pipeline(self, table_querier: Pipeline): + output_keys = {"answer", "coordinates", "cells"} + valid_inputs = self.valid_inputs + invalid_inputs = [ + {"query": "What does it do with empty context ?", "table": ""}, + {"query": "What does it do with empty context ?", "table": None}, + ] + self.assertIsNotNone(table_querier) + + mono_result = table_querier(valid_inputs[0]) + self.assertIsInstance(mono_result, dict) + + for key in output_keys: + self.assertIn(key, mono_result) + + multi_result = table_querier(valid_inputs) + self.assertIsInstance(multi_result, list) + for result in multi_result: + self.assertIsInstance(result, (list, dict)) + + for result in multi_result: + if isinstance(result, list): + for _result in result: + for key in output_keys: + self.assertIn(key, _result) + else: + for key in output_keys: + self.assertIn(key, result) + for bad_input in invalid_inputs: + self.assertRaises(ValueError, table_querier, bad_input) + self.assertRaises(ValueError, table_querier, invalid_inputs) + + def test_aggregation(self): + table_querier = pipeline( + "table-question-answering", + model="lysandre/tiny-tapas-random-wtq", + tokenizer="lysandre/tiny-tapas-random-wtq", + ) + self.assertIsInstance(table_querier.model.config.aggregation_labels, dict) + self.assertIsInstance(table_querier.model.config.no_aggregation_label_index, int) + + mono_result = table_querier(self.valid_inputs[0]) + multi_result = table_querier(self.valid_inputs) + + self.assertIn("aggregator", mono_result) + + for result in multi_result: + if 
isinstance(result, list): + for _result in result: + self.assertIn("aggregator", _result) + else: + self.assertIn("aggregator", result) + + def test_aggregation_with_sequential(self): + table_querier = pipeline( + "table-question-answering", + model="lysandre/tiny-tapas-random-wtq", + tokenizer="lysandre/tiny-tapas-random-wtq", + ) + self.assertIsInstance(table_querier.model.config.aggregation_labels, dict) + self.assertIsInstance(table_querier.model.config.no_aggregation_label_index, int) + + mono_result = table_querier(self.valid_inputs[0], sequential=True) + multi_result = table_querier(self.valid_inputs, sequential=True) + + self.assertIn("aggregator", mono_result) + + for result in multi_result: + if isinstance(result, list): + for _result in result: + self.assertIn("aggregator", _result) + else: + self.assertIn("aggregator", result) + + def test_sequential(self): + table_querier = pipeline( + "table-question-answering", + model="lysandre/tiny-tapas-random-sqa", + tokenizer="lysandre/tiny-tapas-random-sqa", + ) + sequential_mono_result_0 = table_querier(self.valid_inputs[0], sequential=True) + sequential_mono_result_1 = table_querier(self.valid_inputs[1], sequential=True) + sequential_multi_result = table_querier(self.valid_inputs, sequential=True) + mono_result_0 = table_querier(self.valid_inputs[0]) + mono_result_1 = table_querier(self.valid_inputs[1]) + multi_result = table_querier(self.valid_inputs) + + # First valid input has a single question, the dict should be equal + self.assertDictEqual(sequential_mono_result_0, mono_result_0) + + # Second valid input has several questions, the questions following the first one should not be equal + self.assertNotEqual(sequential_mono_result_1, mono_result_1) + + # Assert that we get the same results when passing in several sequences. 
+ for index, (sequential_multi, multi) in enumerate(zip(sequential_multi_result, multi_result)): + if index == 0: + self.assertDictEqual(sequential_multi, multi) + else: + self.assertNotEqual(sequential_multi, multi) + + @slow + def test_integration_wtq(self): + tqa_pipeline = pipeline("table-question-answering") + + data = { + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", "4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + } + queries = [ + "What repository has the largest number of stars?", + "Given that the numbers of stars defines if a repository is active, what repository is the most active?", + "What is the number of repositories?", + "What is the average number of stars?", + "What is the total amount of stars?", + ] + + results = tqa_pipeline(data, queries) + + expected_results = [ + {"answer": "Transformers", "coordinates": [(0, 0)], "cells": ["Transformers"]}, + {"answer": "Transformers", "coordinates": [(0, 0)], "cells": ["Transformers"]}, + { + "answer": "Transformers, Datasets, Tokenizers", + "coordinates": [(0, 0), (1, 0), (2, 0)], + "cells": ["Transformers", "Datasets", "Tokenizers"], + }, + { + "answer": "36542, 4512, 3934", + "coordinates": [(0, 1), (1, 1), (2, 1)], + "cells": ["36542", "4512", "3934"], + }, + { + "answer": "36542, 4512, 3934", + "coordinates": [(0, 1), (1, 1), (2, 1)], + "cells": ["36542", "4512", "3934"], + }, + ] + self.assertListEqual(results, expected_results) + + @slow + def test_integration_sqa(self): + tqa_pipeline = pipeline( + "table-question-answering", + model="nielsr/tapas-base-finetuned-sqa", + tokenizer="nielsr/tapas-base-finetuned-sqa", + ) + data = { + "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], + "Age": ["56", "45", "59"], + "Number of movies": ["87", "53", "69"], + "Date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + } + queries = ["How many movies has George 
Clooney played in?", "How old is he?", "What's his date of birth?"] + results = tqa_pipeline(data, queries, sequential=True) + + expected_results = [ + {"answer": "69", "coordinates": [(2, 2)], "cells": ["69"]}, + {"answer": "59", "coordinates": [(2, 1)], "cells": ["59"]}, + {"answer": "28 november 1967", "coordinates": [(2, 3)], "cells": ["28 november 1967"]}, + ] + self.assertListEqual(results, expected_results)