diff --git a/.circleci/config.yml b/.circleci/config.yml
index d33e5b223b..cbcb3848e9 100644
--- a/.circleci/config.yml
+++ b/.circleci/config.yml
@@ -65,7 +65,7 @@ jobs:
run_tests_torch_and_tf:
working_directory: ~/transformers
docker:
- - image: circleci/python:3.6
+ - image: circleci/python:3.7
environment:
OMP_NUM_THREADS: 1
RUN_PT_TF_CROSS_TESTS: yes
@@ -82,6 +82,7 @@ jobs:
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
+ - run: pip install tensorflow_probability
- save_cache:
key: v0.4-{{ checksum "setup.py" }}
paths:
@@ -118,6 +119,7 @@ jobs:
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision]
- run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html
+ - run: pip install tensorflow_probability
- save_cache:
key: v0.4-{{ checksum "setup.py" }}
paths:
@@ -278,6 +280,7 @@ jobs:
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
+ - run: pip install tensorflow_probability
- save_cache:
key: v0.4-tf-{{ checksum "setup.py" }}
paths:
@@ -311,6 +314,7 @@ jobs:
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision]
+ - run: pip install tensorflow_probability
- save_cache:
key: v0.4-tf-{{ checksum "setup.py" }}
paths:
@@ -468,6 +472,7 @@ jobs:
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
+ - run: pip install tensorflow_probability
- save_cache:
key: v0.4-tf-{{ checksum "setup.py" }}
paths:
@@ -502,6 +507,7 @@ jobs:
- v0.4-{{ checksum "setup.py" }}
- run: pip install --upgrade pip
- run: pip install .[sklearn,tf-cpu,testing,sentencepiece]
+ - run: pip install tensorflow_probability
- save_cache:
key: v0.4-tf-{{ checksum "setup.py" }}
paths:
diff --git a/docs/source/index.rst b/docs/source/index.rst
index c2ce07b2a0..1c91216948 100644
--- a/docs/source/index.rst
+++ b/docs/source/index.rst
@@ -499,7 +499,7 @@ Flax), PyTorch, and/or TensorFlow.
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| T5 | ✅ | ✅ | ✅ | ✅ | ✅ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
-| TAPAS | ✅ | ❌ | ✅ | ❌ | ❌ |
+| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
| Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ |
+-----------------------------+----------------+----------------+-----------------+--------------------+--------------+
diff --git a/docs/source/model_doc/auto.rst b/docs/source/model_doc/auto.rst
index ef4f158740..ad69cf4dc0 100644
--- a/docs/source/model_doc/auto.rst
+++ b/docs/source/model_doc/auto.rst
@@ -265,6 +265,13 @@ TFAutoModelForMultipleChoice
:members:
+TFAutoModelForTableQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFAutoModelForTableQuestionAnswering
+ :members:
+
+
TFAutoModelForTokenClassification
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
diff --git a/docs/source/model_doc/tapas.rst b/docs/source/model_doc/tapas.rst
index d1cea3226a..face832935 100644
--- a/docs/source/model_doc/tapas.rst
+++ b/docs/source/model_doc/tapas.rst
@@ -49,7 +49,8 @@ entailment (a binary classification task). For more details, see their follow-up
intermediate pre-training `__ by Julian Martin Eisenschlos,
Syrine Krichene and Thomas Müller.
-This model was contributed by `nielsr `__. The original code can be found `here
+This model was contributed by `nielsr `__. The TensorFlow version of this model was
+contributed by `kamalkraj `__. The original code can be found `here
`__.
Tips:
@@ -130,6 +131,24 @@ for your environment):
>>> config = TapasConfig('google-base-finetuned-wikisql-supervised')
>>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+In TensorFlow, this can be done as follows (make sure to have installed the `tensorflow_probability dependency
+<https://github.com/tensorflow/probability>`__ for your environment):
+
+.. code-block::
+
+ >>> from transformers import TapasConfig, TFTapasForQuestionAnswering
+
+ >>> # for example, the base sized model with default SQA configuration
+ >>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base')
+
+ >>> # or, the base sized model with WTQ configuration
+ >>> config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wtq')
+ >>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+
+ >>> # or, the base sized model with WikiSQL configuration
+ >>> config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wikisql-supervised')
+ >>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+
Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. You can also
experiment by defining any hyperparameters you want when initializing :class:`~transformers.TapasConfig`, and then
@@ -142,10 +161,21 @@ way. Here's an example:
>>> from transformers import TapasConfig, TapasForQuestionAnswering
>>> # you can initialize the classification heads any way you want (see docs of TapasConfig)
- >>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True, select_one_column=False)
+ >>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True)
>>> # initializing the pre-trained base sized model with our custom classification heads
>>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+And here is the equivalent code for TensorFlow:
+
+.. code-block::
+
+ >>> from transformers import TapasConfig, TFTapasForQuestionAnswering
+
+ >>> # you can initialize the classification heads any way you want (see docs of TapasConfig)
+ >>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True)
+ >>> # initializing the pre-trained base sized model with our custom classification heads
+ >>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config)
+
What you can also do is start from an already fine-tuned checkpoint. A note here is that the already fine-tuned
checkpoint on WTQ has some issues due to the L2-loss which is somewhat brittle. See `here
`__ for more info.
@@ -180,12 +210,13 @@ SQA format. The author explains this `here
are not perfect (the ``answer_coordinates`` and ``float_answer`` fields are populated based on the ``answer_text``),
meaning that WTQ and WikiSQL results could actually be improved.
-**STEP 3: Convert your data into PyTorch tensors using TapasTokenizer**
+**STEP 3: Convert your data into PyTorch/TensorFlow tensors using TapasTokenizer**
Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular
data), you can then use :class:`~transformers.TapasTokenizer` to convert table-question pairs into :obj:`input_ids`,
:obj:`attention_mask`, :obj:`token_type_ids` and so on. Again, based on which of the three cases you picked above,
-:class:`~transformers.TapasForQuestionAnswering` requires different inputs to be fine-tuned:
+:class:`~transformers.TapasForQuestionAnswering`/:class:`~transformers.TFTapasForQuestionAnswering` requires different
+inputs to be fine-tuned:
+------------------------------------+----------------------------------------------------------------------------------------------+
| **Task** | **Required inputs** |
@@ -220,6 +251,8 @@ are already in the TSV file of step 2. Here's an example:
{'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]),
'numeric_values': tensor([[ ... ]]), 'numeric_values_scale: tensor([[ ... ]]), labels: tensor([[ ... ]])}
+Set ``return_tensors='tf'`` when calling the tokenizer to prepare data for the TF models.
+
Note that :class:`~transformers.TapasTokenizer` expects the data of the table to be **text-only**. You can use
``.astype(str)`` on a dataframe to turn it into text-only data. Of course, this only shows how to encode a single
training example. It is advised to create a PyTorch dataset and a corresponding dataloader:
@@ -261,15 +294,67 @@ training example. It is advised to create a PyTorch dataset and a corresponding
>>> train_dataset = TableDataset(data, tokenizer)
>>> train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32)
+And here is the equivalent code for TensorFlow:
+
+.. code-block::
+
+ >>> import tensorflow as tf
+ >>> import pandas as pd
+
+ >>> tsv_path = "your_path_to_the_tsv_file"
+ >>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files"
+
+ >>> class TableDataset:
+ ... def __init__(self, data, tokenizer):
+ ... self.data = data
+ ... self.tokenizer = tokenizer
+ ...
+ ... def __iter__(self):
+ ... for idx in range(self.__len__()):
+ ... item = self.data.iloc[idx]
+ ... table = pd.read_csv(table_csv_path + item.table_file).astype(str) # be sure to make your table data text only
+ ... encoding = self.tokenizer(table=table,
+ ... queries=item.question,
+ ... answer_coordinates=item.answer_coordinates,
+ ... answer_text=item.answer_text,
+ ... truncation=True,
+ ... padding="max_length",
+ ... return_tensors="tf"
+ ... )
+ ... # remove the batch dimension which the tokenizer adds by default
+ ... encoding = {key: tf.squeeze(val,0) for key, val in encoding.items()}
+ ... # add the float_answer which is also required (weak supervision for aggregation case)
+ ... encoding["float_answer"] = tf.convert_to_tensor(item.float_answer,dtype=tf.float32)
+ ... yield encoding['input_ids'], encoding['attention_mask'], encoding['numeric_values'], \
+ ... encoding['numeric_values_scale'], encoding['token_type_ids'], encoding['labels'], \
+ ... encoding['float_answer']
+ ...
+ ... def __len__(self):
+ ... return len(self.data)
+
+ >>> data = pd.read_csv(tsv_path, sep='\t')
+ >>> train_dataset = TableDataset(data, tokenizer)
+ >>> output_signature = (
+ ... tf.TensorSpec(shape=(512,), dtype=tf.int32),
+ ... tf.TensorSpec(shape=(512,), dtype=tf.int32),
+ ... tf.TensorSpec(shape=(512,), dtype=tf.float32),
+ ... tf.TensorSpec(shape=(512,), dtype=tf.float32),
+ ... tf.TensorSpec(shape=(512,7), dtype=tf.int32),
+ ... tf.TensorSpec(shape=(512,), dtype=tf.int32),
+ ... tf.TensorSpec(shape=(512,), dtype=tf.float32))
+ >>> train_dataloader = tf.data.Dataset.from_generator(lambda: train_dataset, output_signature=output_signature).batch(32)
+
Note that here, we encode each table-question pair independently. This is fine as long as your dataset is **not
conversational**. In case your dataset involves conversational questions (such as in SQA), then you should first group
together the ``queries``, ``answer_coordinates`` and ``answer_text`` per table (in the order of their ``position``
index) and batch encode each table with its questions. This will make sure that the ``prev_labels`` token types (see
docs of :class:`~transformers.TapasTokenizer`) are set correctly. See `this notebook
`__
-for more info.
+for more info. See `this notebook
+`__
+for more info regarding using the TensorFlow model.
-**STEP 4: Train (fine-tune) TapasForQuestionAnswering**
+**STEP 4: Train (fine-tune) TapasForQuestionAnswering/TFTapasForQuestionAnswering**
You can then fine-tune :class:`~transformers.TapasForQuestionAnswering` using native PyTorch as follows (shown here for
the weak supervision for aggregation case):
@@ -316,6 +401,52 @@ the weak supervision for aggregation case):
... loss.backward()
... optimizer.step()
+
+Equivalently, fine-tuning :class:`~transformers.TFTapasForQuestionAnswering` in native TensorFlow can be done as
+follows (shown here for the weak supervision for aggregation case):
+
+.. code-block::
+
+ >>> import tensorflow as tf
+ >>> from transformers import TapasConfig, TFTapasForQuestionAnswering
+
+ >>> # this is the default WTQ configuration
+ >>> config = TapasConfig(
+ ... num_aggregation_labels = 4,
+ ... use_answer_as_supervision = True,
+ ... answer_loss_cutoff = 0.664694,
+ ... cell_selection_preference = 0.207951,
+ ... huber_loss_delta = 0.121194,
+ ... init_cell_selection_weights_to_zero = True,
+ ... select_one_column = True,
+ ... allow_empty_column_selection = False,
+ ... temperature = 0.0352513,
+ ... )
+ >>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config)
+
+ >>> optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
+
+ >>> for epoch in range(2): # loop over the dataset multiple times
+ ... for idx, batch in enumerate(train_dataloader):
+ ... # get the inputs (tuple order follows the TableDataset generator)
+ ... input_ids = batch[0]
+ ... attention_mask = batch[1]
+ ... token_type_ids = batch[4]
+ ... labels = batch[5]
+ ... numeric_values = batch[2]
+ ... numeric_values_scale = batch[3]
+ ... float_answer = batch[6]
+
+ ... # forward + backward + optimize
+ ... with tf.GradientTape() as tape:
+ ... outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids,
+ ... labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale,
+ ... float_answer=float_answer )
+ ... grads = tape.gradient(outputs.loss, model.trainable_weights)
+ ... optimizer.apply_gradients(zip(grads, model.trainable_weights))
+
+
+
Usage: inference
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
@@ -380,10 +511,68 @@ of that:
What is the total number of movies?
Predicted answer: SUM > 87, 53, 69
+
+And here is the equivalent code for TensorFlow:
+
+.. code-block::
+
+ >>> from transformers import TapasTokenizer, TFTapasForQuestionAnswering
+ >>> import pandas as pd
+
+ >>> model_name = 'google/tapas-base-finetuned-wtq'
+ >>> model = TFTapasForQuestionAnswering.from_pretrained(model_name)
+ >>> tokenizer = TapasTokenizer.from_pretrained(model_name)
+
+ >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]}
+ >>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"]
+ >>> table = pd.DataFrame.from_dict(data)
+ >>> inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="tf")
+ >>> outputs = model(**inputs)
+ >>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions(
+ ... inputs,
+ ... outputs.logits,
+ ... outputs.logits_aggregation
+ ... )
+
+ >>> # let's print out the results:
+ >>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3:"COUNT"}
+ >>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices]
+
+ >>> answers = []
+ >>> for coordinates in predicted_answer_coordinates:
+ ... if len(coordinates) == 1:
+ ... # only a single cell:
+ ... answers.append(table.iat[coordinates[0]])
+ ... else:
+ ... # multiple cells
+ ... cell_values = []
+ ... for coordinate in coordinates:
+ ... cell_values.append(table.iat[coordinate])
+ ... answers.append(", ".join(cell_values))
+
+ >>> display(table)
+ >>> print("")
+ >>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string):
+ ... print(query)
+ ... if predicted_agg == "NONE":
+ ... print("Predicted answer: " + answer)
+ ... else:
+ ... print("Predicted answer: " + predicted_agg + " > " + answer)
+ What is the name of the first actor?
+ Predicted answer: Brad Pitt
+ How many movies has George Clooney played in?
+ Predicted answer: COUNT > 69
+ What is the total number of movies?
+ Predicted answer: SUM > 87, 53, 69
+
+
In case of a conversational set-up, then each table-question pair must be provided **sequentially** to the model, such
that the ``prev_labels`` token types can be overwritten by the predicted ``labels`` of the previous table-question
pair. Again, more info can be found in `this notebook
-`__.
+`__
+(for PyTorch) and `this notebook
+`__
+(for TensorFlow).
Tapas specific outputs
@@ -433,3 +622,31 @@ TapasForQuestionAnswering
.. autoclass:: transformers.TapasForQuestionAnswering
:members: forward
+
+
+TFTapasModel
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFTapasModel
+ :members: call
+
+
+TFTapasForMaskedLM
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFTapasForMaskedLM
+ :members: call
+
+
+TFTapasForSequenceClassification
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFTapasForSequenceClassification
+ :members: call
+
+
+TFTapasForQuestionAnswering
+~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
+
+.. autoclass:: transformers.TFTapasForQuestionAnswering
+ :members: call
diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py
index 5595bd1c9b..a6d0b2a2eb 100755
--- a/src/transformers/__init__.py
+++ b/src/transformers/__init__.py
@@ -1446,6 +1446,7 @@ if is_tf_available():
"TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
"TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
+ "TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
"TF_MODEL_MAPPING",
"TF_MODEL_WITH_LM_HEAD_MAPPING",
@@ -1458,6 +1459,7 @@ if is_tf_available():
"TFAutoModelForQuestionAnswering",
"TFAutoModelForSeq2SeqLM",
"TFAutoModelForSequenceClassification",
+ "TFAutoModelForTableQuestionAnswering",
"TFAutoModelForTokenClassification",
"TFAutoModelWithLMHead",
]
@@ -1767,6 +1769,16 @@ if is_tf_available():
"TFT5PreTrainedModel",
]
)
+ _import_structure["models.tapas"].extend(
+ [
+ "TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST",
+ "TFTapasForMaskedLM",
+ "TFTapasForQuestionAnswering",
+ "TFTapasForSequenceClassification",
+ "TFTapasModel",
+ "TFTapasPreTrainedModel",
+ ]
+ )
_import_structure["models.transfo_xl"].extend(
[
"TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST",
@@ -3225,6 +3237,7 @@ if TYPE_CHECKING:
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+ TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_MAPPING,
TF_MODEL_WITH_LM_HEAD_MAPPING,
@@ -3237,6 +3250,7 @@ if TYPE_CHECKING:
TFAutoModelForQuestionAnswering,
TFAutoModelForSeq2SeqLM,
TFAutoModelForSequenceClassification,
+ TFAutoModelForTableQuestionAnswering,
TFAutoModelForTokenClassification,
TFAutoModelWithLMHead,
)
@@ -3483,6 +3497,14 @@ if TYPE_CHECKING:
TFT5Model,
TFT5PreTrainedModel,
)
+ from .models.tapas import (
+ TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST,
+ TFTapasForMaskedLM,
+ TFTapasForQuestionAnswering,
+ TFTapasForSequenceClassification,
+ TFTapasModel,
+ TFTapasPreTrainedModel,
+ )
from .models.transfo_xl import (
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST,
TFAdaptiveEmbedding,
diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py
index 5d99b36c14..5294f3aab7 100644
--- a/src/transformers/file_utils.py
+++ b/src/transformers/file_utils.py
@@ -213,6 +213,14 @@ except importlib_metadata.PackageNotFoundError:
_soundfile_available = False
+_tensorflow_probability_available = importlib.util.find_spec("tensorflow_probability") is not None
+try:
+ _tensorflow_probability_version = importlib_metadata.version("tensorflow_probability")
+ logger.debug(f"Successfully imported tensorflow-probability version {_tensorflow_probability_version}")
+except importlib_metadata.PackageNotFoundError:
+ _tensorflow_probability_available = False
+
+
_timm_available = importlib.util.find_spec("timm") is not None
try:
_timm_version = importlib_metadata.version("timm")
@@ -444,6 +452,10 @@ def is_pytorch_quantization_available():
return _pytorch_quantization_available
+def is_tensorflow_probability_available():
+ return _tensorflow_probability_available
+
+
def is_pandas_available():
return importlib.util.find_spec("pandas") is not None
@@ -629,6 +641,12 @@ PYTORCH_QUANTIZATION_IMPORT_ERROR = """
`pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com`
"""
+# docstyle-ignore
+TENSORFLOW_PROBABILITY_IMPORT_ERROR = """
+{0} requires the tensorflow_probability library but it was not found in your environment. You can install it with pip as
+explained here: https://github.com/tensorflow/probability.
+"""
+
# docstyle-ignore
PANDAS_IMPORT_ERROR = """
@@ -684,6 +702,7 @@ BACKENDS_MAPPING = OrderedDict(
("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)),
("sklearn", (is_sklearn_available, SKLEARN_IMPORT_ERROR)),
("speech", (is_speech_available, SPEECH_IMPORT_ERROR)),
+ ("tensorflow_probability", (is_tensorflow_probability_available, TENSORFLOW_PROBABILITY_IMPORT_ERROR)),
("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)),
("timm", (is_timm_available, TIMM_IMPORT_ERROR)),
("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)),
diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py
index 9ed2a0a1cd..037525979b 100644
--- a/src/transformers/modeling_tf_pytorch_utils.py
+++ b/src/transformers/modeling_tf_pytorch_utils.py
@@ -399,7 +399,9 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F
raise e
# logger.warning(f"Initialize PyTorch weight {pt_weight_name}")
-
+ # Make sure we have a proper numpy array
+ if numpy.isscalar(array):
+ array = numpy.array(array)
new_pt_params_dict[pt_weight_name] = torch.from_numpy(array)
loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = torch.from_numpy(array)
all_tf_weights.discard(pt_weight_name)
diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py
index ba4ba2dd7f..bd6e3a369b 100644
--- a/src/transformers/models/auto/__init__.py
+++ b/src/transformers/models/auto/__init__.py
@@ -83,6 +83,7 @@ if is_tf_available():
"TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING",
"TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING",
+ "TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING",
"TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING",
"TF_MODEL_MAPPING",
"TF_MODEL_WITH_LM_HEAD_MAPPING",
@@ -95,6 +96,7 @@ if is_tf_available():
"TFAutoModelForQuestionAnswering",
"TFAutoModelForSeq2SeqLM",
"TFAutoModelForSequenceClassification",
+ "TFAutoModelForTableQuestionAnswering",
"TFAutoModelForTokenClassification",
"TFAutoModelWithLMHead",
]
@@ -189,6 +191,7 @@ if TYPE_CHECKING:
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+ TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_MAPPING,
TF_MODEL_WITH_LM_HEAD_MAPPING,
@@ -201,6 +204,7 @@ if TYPE_CHECKING:
TFAutoModelForQuestionAnswering,
TFAutoModelForSeq2SeqLM,
TFAutoModelForSequenceClassification,
+ TFAutoModelForTableQuestionAnswering,
TFAutoModelForTokenClassification,
TFAutoModelWithLMHead,
)
diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py
index 0ff9eed872..d32abfdd8a 100644
--- a/src/transformers/models/auto/modeling_tf_auto.py
+++ b/src/transformers/models/auto/modeling_tf_auto.py
@@ -59,6 +59,7 @@ TF_MODEL_MAPPING_NAMES = OrderedDict(
("funnel", ("TFFunnelModel", "TFFunnelBaseModel")),
("dpr", "TFDPRQuestionEncoder"),
("mpnet", "TFMPNetModel"),
+ ("tapas", "TFTapasModel"),
("mbart", "TFMBartModel"),
("marian", "TFMarianModel"),
("pegasus", "TFPegasusModel"),
@@ -92,6 +93,7 @@ TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict(
("xlm", "TFXLMWithLMHeadModel"),
("ctrl", "TFCTRLLMHeadModel"),
("electra", "TFElectraForPreTraining"),
+ ("tapas", "TFTapasForMaskedLM"),
("funnel", "TFFunnelForPreTraining"),
("mpnet", "TFMPNetForMaskedLM"),
]
@@ -124,6 +126,7 @@ TF_MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict(
("xlm", "TFXLMWithLMHeadModel"),
("ctrl", "TFCTRLLMHeadModel"),
("electra", "TFElectraForMaskedLM"),
+ ("tapas", "TFTapasForMaskedLM"),
("funnel", "TFFunnelForMaskedLM"),
("mpnet", "TFMPNetForMaskedLM"),
]
@@ -172,6 +175,7 @@ TF_MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict(
("flaubert", "TFFlaubertWithLMHeadModel"),
("xlm", "TFXLMWithLMHeadModel"),
("electra", "TFElectraForMaskedLM"),
+ ("tapas", "TFTapasForMaskedLM"),
("funnel", "TFFunnelForMaskedLM"),
("mpnet", "TFMPNetForMaskedLM"),
]
@@ -215,6 +219,7 @@ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
("flaubert", "TFFlaubertForSequenceClassification"),
("xlm", "TFXLMForSequenceClassification"),
("electra", "TFElectraForSequenceClassification"),
+ ("tapas", "TFTapasForSequenceClassification"),
("funnel", "TFFunnelForSequenceClassification"),
("gpt2", "TFGPT2ForSequenceClassification"),
("mpnet", "TFMPNetForSequenceClassification"),
@@ -249,6 +254,14 @@ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
]
)
+TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict(
+ [
+ # Model for Table Question Answering mapping
+ ("tapas", "TFTapasForQuestionAnswering"),
+ ]
+)
+
+
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict(
[
# Model for Token Classification mapping
@@ -323,6 +336,9 @@ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping(
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES
)
+TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping(
+ CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES
+)
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping(
CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES
)
@@ -402,6 +418,17 @@ class TFAutoModelForQuestionAnswering(_BaseAutoModelClass):
TFAutoModelForQuestionAnswering = auto_class_update(TFAutoModelForQuestionAnswering, head_doc="question answering")
+class TFAutoModelForTableQuestionAnswering(_BaseAutoModelClass):
+ _model_mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
+
+
+TFAutoModelForTableQuestionAnswering = auto_class_update(
+ TFAutoModelForTableQuestionAnswering,
+ head_doc="table question answering",
+ checkpoint_for_example="google/tapas-base-finetuned-wtq",
+)
+
+
class TFAutoModelForTokenClassification(_BaseAutoModelClass):
_model_mapping = TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING
diff --git a/src/transformers/models/tapas/__init__.py b/src/transformers/models/tapas/__init__.py
index 3949ed6fba..9f1d442b98 100644
--- a/src/transformers/models/tapas/__init__.py
+++ b/src/transformers/models/tapas/__init__.py
@@ -18,7 +18,7 @@
from typing import TYPE_CHECKING
-from ...file_utils import _LazyModule, is_torch_available
+from ...file_utils import _LazyModule, is_tf_available, is_torch_available
_import_structure = {
@@ -36,6 +36,15 @@ if is_torch_available():
"TapasPreTrainedModel",
"load_tf_weights_in_tapas",
]
+if is_tf_available():
+ _import_structure["modeling_tf_tapas"] = [
+ "TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST",
+ "TFTapasForMaskedLM",
+ "TFTapasForQuestionAnswering",
+ "TFTapasForSequenceClassification",
+ "TFTapasModel",
+ "TFTapasPreTrainedModel",
+ ]
if TYPE_CHECKING:
@@ -53,6 +62,17 @@ if TYPE_CHECKING:
load_tf_weights_in_tapas,
)
+ if is_tf_available():
+ from .modeling_tf_tapas import (
+ TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST,
+ TFTapasForMaskedLM,
+ TFTapasForQuestionAnswering,
+ TFTapasForSequenceClassification,
+ TFTapasModel,
+ TFTapasPreTrainedModel,
+ )
+
+
else:
import sys
diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py
new file mode 100644
index 0000000000..ada1194067
--- /dev/null
+++ b/src/transformers/models/tapas/modeling_tf_tapas.py
@@ -0,0 +1,2398 @@
+# coding=utf-8
+# Copyright 2021 Google Research and The HuggingFace Inc. team.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""TF 2.0 TAPAS model. """
+
+import enum
+import math
+from dataclasses import dataclass
+from typing import Dict, Optional, Tuple, Union
+
+import numpy as np
+import tensorflow as tf
+
+from ...activations_tf import get_tf_activation
+from ...file_utils import (
+ ModelOutput,
+ add_code_sample_docstrings,
+ add_start_docstrings,
+ add_start_docstrings_to_model_forward,
+ is_tensorflow_probability_available,
+ requires_backends,
+)
+from ...modeling_tf_outputs import (
+ TFBaseModelOutputWithPastAndCrossAttentions,
+ TFBaseModelOutputWithPooling,
+ TFMaskedLMOutput,
+ TFSequenceClassifierOutput,
+)
+from ...modeling_tf_utils import (
+ TFMaskedLanguageModelingLoss,
+ TFModelInputType,
+ TFPreTrainedModel,
+ TFSequenceClassificationLoss,
+ get_initializer,
+ input_processing,
+ keras_serializable,
+ shape_list,
+)
+from ...utils import logging
+from .configuration_tapas import TapasConfig
+
+
+logger = logging.get_logger(__name__)
+
+# soft dependency: only attempt the import when the package is reported as available
+if is_tensorflow_probability_available():
+    try:
+        import tensorflow_probability as tfp
+
+        # On the first call, check whether a compatible version of TensorFlow is installed
+        # TensorFlow Probability depends on a recent stable release of TensorFlow
+        n = tfp.distributions.Normal(loc=0.0, scale=1.0)
+    except ImportError:
+        # Adjacent string literals are concatenated verbatim, so each sentence needs its own trailing space.
+        logger.error(
+            "TAPAS models are not usable since `tensorflow_probability` can't be loaded. "
+            "It seems you have `tensorflow_probability` installed with the wrong tensorflow version. "
+            "Please try to reinstall it following the instructions here: https://github.com/tensorflow/probability."
+        )
+
+_CONFIG_FOR_DOC = "TapasConfig"
+_TOKENIZER_FOR_DOC = "TapasTokenizer"
+_CHECKPOINT_FOR_DOC = "google/tapas-base"
+
+TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST = [
+ # large models
+ "google/tapas-large",
+ "google/tapas-large-finetuned-sqa",
+ "google/tapas-large-finetuned-wtq",
+ "google/tapas-large-finetuned-wikisql-supervised",
+ "google/tapas-large-finetuned-tabfact",
+ # base models
+ "google/tapas-base",
+ "google/tapas-base-finetuned-sqa",
+ "google/tapas-base-finetuned-wtq",
+ "google/tapas-base-finetuned-wikisql-supervised",
+ "google/tapas-base-finetuned-tabfact",
+ # small models
+ "google/tapas-small",
+ "google/tapas-small-finetuned-sqa",
+ "google/tapas-small-finetuned-wtq",
+ "google/tapas-small-finetuned-wikisql-supervised",
+ "google/tapas-small-finetuned-tabfact",
+ # mini models
+ "google/tapas-mini",
+ "google/tapas-mini-finetuned-sqa",
+ "google/tapas-mini-finetuned-wtq",
+ "google/tapas-mini-finetuned-wikisql-supervised",
+ "google/tapas-mini-finetuned-tabfact",
+ # tiny models
+ "google/tapas-tiny",
+ "google/tapas-tiny-finetuned-sqa",
+ "google/tapas-tiny-finetuned-wtq",
+ "google/tapas-tiny-finetuned-wikisql-supervised",
+ "google/tapas-tiny-finetuned-tabfact",
+ # See all TAPAS models at https://huggingface.co/models?filter=tapas
+]
+
+EPSILON_ZERO_DIVISION = 1e-10
+CLOSE_ENOUGH_TO_LOG_ZERO = -10000.0
+
+
+@dataclass
+class TFTableQuestionAnsweringOutput(ModelOutput):
+    """
+    Output type of :class:`~transformers.TFTapasForQuestionAnswering`.
+
+    Args:
+        loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` (and possibly :obj:`answer`, :obj:`aggregation_labels`, :obj:`numeric_values` and :obj:`numeric_values_scale` are provided)):
+            Total loss as the sum of the hierarchical cell selection log-likelihood loss and (optionally) the
+            semi-supervised regression loss and (optionally) supervised loss for aggregations.
+        logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+            Prediction scores of the cell selection head, for every token.
+        logits_aggregation (:obj:`tf.Tensor`, `optional`, of shape :obj:`(batch_size, num_aggregation_labels)`):
+            Prediction scores of the aggregation head, for every aggregation operator.
+        hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``):
+            Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of
+            shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each
+            layer plus the initial embedding outputs.
+        attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``):
+            Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length,
+            sequence_length)`. Attention weights after the attention softmax, used to compute the weighted average in
+            the self-attention heads.
+    """
+
+    # Every field defaults to None, so the output can be built with any subset of them.
+    loss: Optional[tf.Tensor] = None
+    logits: tf.Tensor = None
+    logits_aggregation: Optional[tf.Tensor] = None
+    hidden_states: Optional[Tuple[tf.Tensor]] = None
+    attentions: Optional[Tuple[tf.Tensor]] = None
+
+
+class TFTapasEmbeddings(tf.keras.layers.Layer):
+    """
+    Construct the embeddings from word, position and token_type embeddings. Same as BertEmbeddings but with a number of
+    additional token type embeddings to encode tabular structure.
+    """
+
+    def __init__(self, config: TapasConfig, **kwargs):
+        """Copy the sizes and flags used by this layer from ``config`` and create the LayerNorm/dropout sub-layers."""
+        super().__init__(**kwargs)
+
+        self.vocab_size = config.vocab_size
+        self.type_vocab_sizes = config.type_vocab_sizes
+        # One token type embedding table is created per entry of `type_vocab_sizes` (see `build`).
+        self.number_of_token_type_embeddings = len(config.type_vocab_sizes)
+        self.reset_position_index_per_cell = config.reset_position_index_per_cell
+        self.hidden_size = config.hidden_size
+        self.max_position_embeddings = config.max_position_embeddings
+        self.initializer_range = config.initializer_range
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+
+    def build(self, input_shape: tf.TensorShape):
+        """Create the word, position and token type embedding tables, each inside its own name scope."""
+        with tf.name_scope("word_embeddings"):
+            self.weight = self.add_weight(
+                name="weight",
+                shape=[self.vocab_size, self.hidden_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+
+        with tf.name_scope("position_embeddings"):
+            self.position_embeddings = self.add_weight(
+                name="embeddings",
+                shape=[self.max_position_embeddings, self.hidden_size],
+                initializer=get_initializer(self.initializer_range),
+            )
+        # The tables are stored as attributes `token_type_embeddings_0`, `token_type_embeddings_1`, ...
+        # and read back by the same names in `call`.
+        for i, type_vocab_size in enumerate(self.type_vocab_sizes):
+            with tf.name_scope(f"token_type_embeddings_{i}"):
+                setattr(
+                    self,
+                    f"token_type_embeddings_{i}",
+                    self.add_weight(
+                        name="embeddings",
+                        shape=[type_vocab_size, self.hidden_size],
+                        initializer=get_initializer(self.initializer_range),
+                    ),
+                )
+
+        super().build(input_shape)
+
+    def call(
+        self,
+        input_ids: tf.Tensor = None,
+        position_ids: tf.Tensor = None,
+        token_type_ids: tf.Tensor = None,
+        inputs_embeds: tf.Tensor = None,
+        training: bool = False,
+    ) -> tf.Tensor:
+        """
+        Applies embedding based on inputs tensor.
+
+        At least one of ``input_ids`` and ``inputs_embeds`` must be given. Missing ``token_type_ids`` default to
+        zeros and missing ``position_ids`` are generated here. The result is the sum of the input, position and all
+        token type embeddings, passed through LayerNorm and dropout.
+
+        Returns:
+            final_embeddings (:obj:`tf.Tensor`): output embedding tensor.
+        """
+        assert not (input_ids is None and inputs_embeds is None)
+        if input_ids is not None:
+            input_shape = shape_list(input_ids)
+        else:
+            # Drop the trailing embedding dimension of `inputs_embeds`.
+            input_shape = shape_list(inputs_embeds)[:-1]
+
+        seq_length = input_shape[1]
+
+        if token_type_ids is None:
+            # All-zero ids with one trailing entry per token type embedding table.
+            token_type_ids = tf.fill(dims=input_shape + [self.number_of_token_type_embeddings], value=0)
+
+        if position_ids is None:
+            # create absolute position embeddings
+            position_ids = tf.expand_dims(tf.range(start=0, limit=seq_length), axis=0)
+            position_ids = tf.broadcast_to(position_ids, shape=input_shape)
+            # when self.config.reset_position_index_per_cell is set to True, create relative position embeddings
+            # (IndexMap, ProductIndexMap, reduce_min and gather are helpers defined elsewhere in this module)
+            if self.reset_position_index_per_cell:
+
+                # shape (batch_size, seq_len)
+                col_index = IndexMap(token_type_ids[:, :, 1], self.type_vocab_sizes[1], batch_dims=1)
+                # shape (batch_size, seq_len)
+                row_index = IndexMap(token_type_ids[:, :, 2], self.type_vocab_sizes[2], batch_dims=1)
+                # shape (batch_size, seq_len)
+                full_index = ProductIndexMap(col_index, row_index)
+                # shape (max_rows * max_columns,). First absolute position for every cell
+                first_position_per_segment = reduce_min(position_ids, full_index)[0]
+                # presumably shape (batch_size, seq_len) -- TODO confirm. First absolute position of the cell for
+                # every token
+                first_position = gather(first_position_per_segment, full_index)
+                # shape (1, seq_len)
+                position = tf.expand_dims(tf.range(start=0, limit=seq_length), axis=0)
+                # Clamp so the ids never exceed the last row of the position embedding table.
+                position_ids = tf.math.minimum(self.max_position_embeddings - 1, position - first_position)
+
+        if input_ids is not None:
+            inputs_embeds = tf.gather(params=self.weight, indices=input_ids)
+
+        position_embeddings = tf.gather(self.position_embeddings, indices=position_ids)
+
+        final_embeddings = inputs_embeds + position_embeddings
+
+        # Add one embedding per token type, indexed by the last axis of `token_type_ids`.
+        for i in range(self.number_of_token_type_embeddings):
+            name = f"token_type_embeddings_{i}"
+            final_embeddings += tf.gather(params=getattr(self, name), indices=token_type_ids[:, :, i])
+
+        final_embeddings = self.LayerNorm(inputs=final_embeddings)
+        final_embeddings = self.dropout(inputs=final_embeddings, training=training)
+
+        return final_embeddings
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Tapas
+class TFTapasSelfAttention(tf.keras.layers.Layer):
+    """
+    Multi-head scaled dot-product attention.
+
+    Supports plain self-attention, cross-attention (keys/values computed from ``encoder_hidden_states``) and reuse of
+    previously computed keys/values passed as ``past_key_value``.
+    """
+
+    def __init__(self, config: TapasConfig, **kwargs):
+        """
+        Create the query/key/value projections and the attention dropout.
+
+        Raises:
+            ValueError: if ``config.hidden_size`` is not a multiple of ``config.num_attention_heads``.
+        """
+        super().__init__(**kwargs)
+
+        if config.hidden_size % config.num_attention_heads != 0:
+            raise ValueError(
+                f"The hidden size ({config.hidden_size}) is not a multiple of the number "
+                f"of attention heads ({config.num_attention_heads})"
+            )
+
+        self.num_attention_heads = config.num_attention_heads
+        self.attention_head_size = int(config.hidden_size / config.num_attention_heads)
+        self.all_head_size = self.num_attention_heads * self.attention_head_size
+        # Scaling factor applied to the raw attention scores in `call`.
+        self.sqrt_att_head_size = math.sqrt(self.attention_head_size)
+
+        self.query = tf.keras.layers.Dense(
+            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query"
+        )
+        self.key = tf.keras.layers.Dense(
+            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key"
+        )
+        self.value = tf.keras.layers.Dense(
+            units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value"
+        )
+        self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob)
+
+        self.is_decoder = config.is_decoder
+
+    def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor:
+        """Split the last dimension into heads and move the head axis in front of the sequence axis."""
+        # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size]
+        tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size))
+
+        # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size]
+        return tf.transpose(tensor, perm=[0, 2, 1, 3])
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor,
+        encoder_attention_mask: tf.Tensor,
+        past_key_value: Tuple[tf.Tensor],
+        output_attentions: bool,
+        training: bool = False,
+    ) -> Tuple[tf.Tensor]:
+        """
+        Compute the attention output for ``hidden_states``.
+
+        Returns:
+            A tuple ``(attention_output,)``, followed by the attention probabilities when ``output_attentions`` is
+            true, followed by the ``(key_layer, value_layer)`` pair when the layer is configured as a decoder.
+        """
+        batch_size = shape_list(hidden_states)[0]
+        mixed_query_layer = self.query(inputs=hidden_states)
+
+        # If this is instantiated as a cross-attention module, the keys
+        # and values come from an encoder; the attention mask needs to be
+        # such that the encoder's padding tokens are not attended to.
+        is_cross_attention = encoder_hidden_states is not None
+
+        if is_cross_attention and past_key_value is not None:
+            # reuse k,v, cross_attentions
+            key_layer = past_key_value[0]
+            value_layer = past_key_value[1]
+            attention_mask = encoder_attention_mask
+        elif is_cross_attention:
+            key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size)
+            value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size)
+            attention_mask = encoder_attention_mask
+        elif past_key_value is not None:
+            key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
+            value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
+            # Append the new keys/values to the cached ones along the sequence axis (axis 2 after
+            # `transpose_for_scores`). TensorFlow's concatenation op is `tf.concat(values, axis=...)`.
+            # NOTE(review): if the BERT class this one is copied from still differs here, fix it there too.
+            key_layer = tf.concat([past_key_value[0], key_layer], axis=2)
+            value_layer = tf.concat([past_key_value[1], value_layer], axis=2)
+        else:
+            key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size)
+            value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size)
+
+        query_layer = self.transpose_for_scores(mixed_query_layer, batch_size)
+
+        if self.is_decoder:
+            # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states.
+            # Further calls to cross_attention layer can then reuse all cross-attention
+            # key/value_states (first "if" case)
+            # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of
+            # all previous decoder key/value_states. Further calls to uni-directional self-attention
+            # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case)
+            # if encoder bi-directional self-attention `past_key_value` is always `None`
+            past_key_value = (key_layer, value_layer)
+
+        # Take the dot product between "query" and "key" to get the raw attention scores.
+        # (batch size, num_heads, seq_len_q, seq_len_k)
+        attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True)
+        dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype)
+        attention_scores = tf.divide(attention_scores, dk)
+
+        if attention_mask is not None:
+            # Apply the attention mask (precomputed for all layers in TFTapasModel call() function)
+            attention_scores = tf.add(attention_scores, attention_mask)
+
+        # Normalize the attention scores to probabilities.
+        attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1)
+
+        # This is actually dropping out entire tokens to attend to, which might
+        # seem a bit unusual, but is taken from the original Transformer paper.
+        attention_probs = self.dropout(inputs=attention_probs, training=training)
+
+        # Mask heads if we want to
+        if head_mask is not None:
+            attention_probs = tf.multiply(attention_probs, head_mask)
+
+        attention_output = tf.matmul(attention_probs, value_layer)
+        attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3])
+
+        # (batch_size, seq_len_q, all_head_size)
+        attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size))
+        outputs = (attention_output, attention_probs) if output_attentions else (attention_output,)
+
+        if self.is_decoder:
+            outputs = outputs + (past_key_value,)
+        return outputs
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Tapas
+class TFTapasSelfOutput(tf.keras.layers.Layer):
+    """Dense projection of the attention output, followed by dropout and a residual LayerNorm."""
+
+    def __init__(self, config: TapasConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        kernel_init = get_initializer(config.initializer_range)
+        self.dense = tf.keras.layers.Dense(units=config.hidden_size, kernel_initializer=kernel_init, name="dense")
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+
+    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
+        projected = self.dense(inputs=hidden_states)
+        regularized = self.dropout(inputs=projected, training=training)
+
+        # Residual connection with the layer input, then normalization.
+        return self.LayerNorm(inputs=regularized + input_tensor)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Tapas
+class TFTapasAttention(tf.keras.layers.Layer):
+    """Attention sub-layer followed by its output sub-layer (``TFTapasSelfOutput``)."""
+
+    def __init__(self, config: TapasConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        self.self_attention = TFTapasSelfAttention(config, name="self")
+        self.dense_output = TFTapasSelfOutput(config, name="output")
+
+    def prune_heads(self, heads):
+        """Head pruning is not implemented for this layer."""
+        raise NotImplementedError
+
+    def call(
+        self,
+        input_tensor: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: tf.Tensor,
+        encoder_attention_mask: tf.Tensor,
+        past_key_value: Tuple[tf.Tensor],
+        output_attentions: bool,
+        training: bool = False,
+    ) -> Tuple[tf.Tensor]:
+        """Run attention over ``input_tensor`` and project the result; extra attention outputs are passed through."""
+        attention_kwargs = dict(
+            hidden_states=input_tensor,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=encoder_hidden_states,
+            encoder_attention_mask=encoder_attention_mask,
+            past_key_value=past_key_value,
+            output_attentions=output_attentions,
+            training=training,
+        )
+        attention_results = self.self_attention(**attention_kwargs)
+        context = attention_results[0]
+        projected = self.dense_output(hidden_states=context, input_tensor=input_tensor, training=training)
+
+        # Whatever follows the context tensor (attentions, possibly past_key_value) is appended unchanged.
+        trailing = attention_results[1:]
+        return (projected,) + trailing
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Tapas
+class TFTapasIntermediate(tf.keras.layers.Layer):
+    """Dense layer of width ``config.intermediate_size`` followed by the configured activation."""
+
+    def __init__(self, config: TapasConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        kernel_init = get_initializer(config.initializer_range)
+        self.dense = tf.keras.layers.Dense(units=config.intermediate_size, kernel_initializer=kernel_init, name="dense")
+
+        # `hidden_act` is either an activation name (resolved through get_tf_activation) or used as given.
+        activation = config.hidden_act
+        self.intermediate_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        """Apply the dense projection, then the activation function."""
+        projected = self.dense(inputs=hidden_states)
+        return self.intermediate_act_fn(projected)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Tapas
+class TFTapasOutput(tf.keras.layers.Layer):
+    """Projects back to ``config.hidden_size``, then applies dropout and a residual LayerNorm."""
+
+    def __init__(self, config: TapasConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        kernel_init = get_initializer(config.initializer_range)
+        self.dense = tf.keras.layers.Dense(units=config.hidden_size, kernel_initializer=kernel_init, name="dense")
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+        self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob)
+
+    def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor:
+        """Return ``LayerNorm(dropout(dense(hidden_states)) + input_tensor)``."""
+        projected = self.dense(inputs=hidden_states)
+        regularized = self.dropout(inputs=projected, training=training)
+
+        # Residual connection with the layer input, then normalization.
+        return self.LayerNorm(inputs=regularized + input_tensor)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Tapas
+class TFTapasLayer(tf.keras.layers.Layer):
+    """One transformer layer: self-attention, optional cross-attention, then the intermediate and output blocks."""
+
+    def __init__(self, config: TapasConfig, **kwargs):
+        """
+        Build the sub-layers; a cross-attention block is only created when ``config.add_cross_attention`` is set.
+
+        Raises:
+            ValueError: if cross-attention is requested while ``config.is_decoder`` is false.
+        """
+        super().__init__(**kwargs)
+
+        self.attention = TFTapasAttention(config, name="attention")
+        self.is_decoder = config.is_decoder
+        self.add_cross_attention = config.add_cross_attention
+        if self.add_cross_attention:
+            if not self.is_decoder:
+                raise ValueError(f"{self} should be used as a decoder model if cross attention is added")
+            self.crossattention = TFTapasAttention(config, name="crossattention")
+        self.intermediate = TFTapasIntermediate(config, name="intermediate")
+        self.bert_output = TFTapasOutput(config, name="output")
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: Optional[tf.Tensor],
+        encoder_attention_mask: Optional[tf.Tensor],
+        past_key_value: Optional[Tuple[tf.Tensor]],
+        output_attentions: bool,
+        training: bool = False,
+    ) -> Tuple[tf.Tensor]:
+        """
+        Run the layer on ``hidden_states``.
+
+        Returns:
+            A tuple starting with the layer output, followed by any attention tensors returned by the attention
+            blocks, and ending with the present key/value cache when the layer is a decoder.
+
+        Raises:
+            ValueError: if ``encoder_hidden_states`` is given to a decoder layer built without cross-attention.
+        """
+        # decoder uni-directional self-attention cached key/values tuple is at positions 1,2
+        self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None
+        self_attention_outputs = self.attention(
+            input_tensor=hidden_states,
+            attention_mask=attention_mask,
+            head_mask=head_mask,
+            encoder_hidden_states=None,
+            encoder_attention_mask=None,
+            past_key_value=self_attn_past_key_value,
+            output_attentions=output_attentions,
+            training=training,
+        )
+        attention_output = self_attention_outputs[0]
+
+        # if decoder, the last output is tuple of self-attn cache
+        if self.is_decoder:
+            outputs = self_attention_outputs[1:-1]
+            present_key_value = self_attention_outputs[-1]
+        else:
+            outputs = self_attention_outputs[1:]  # add self attentions if we output attention weights
+
+        cross_attn_present_key_value = None
+        if self.is_decoder and encoder_hidden_states is not None:
+            if not hasattr(self, "crossattention"):
+                raise ValueError(
+                    f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers "
+                    "by setting `config.add_cross_attention=True`"
+                )
+
+            # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple
+            cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None
+            cross_attention_outputs = self.crossattention(
+                input_tensor=attention_output,
+                attention_mask=attention_mask,
+                head_mask=head_mask,
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_value=cross_attn_past_key_value,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            attention_output = cross_attention_outputs[0]
+            outputs = outputs + cross_attention_outputs[1:-1]  # add cross attentions if we output attention weights
+
+            # add cross-attn cache to positions 3,4 of present_key_value tuple
+            cross_attn_present_key_value = cross_attention_outputs[-1]
+            present_key_value = present_key_value + cross_attn_present_key_value
+
+        # Feed-forward part: intermediate projection, then output projection with a residual on attention_output.
+        intermediate_output = self.intermediate(hidden_states=attention_output)
+        layer_output = self.bert_output(
+            hidden_states=intermediate_output, input_tensor=attention_output, training=training
+        )
+        outputs = (layer_output,) + outputs  # add attentions if we output them
+
+        # if decoder, return the attn key/values as the last output
+        if self.is_decoder:
+            outputs = outputs + (present_key_value,)
+
+        return outputs
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Tapas
+class TFTapasEncoder(tf.keras.layers.Layer):
+    """Stack of ``config.num_hidden_layers`` :class:`TFTapasLayer` instances applied in sequence."""
+
+    def __init__(self, config: TapasConfig, **kwargs):
+        super().__init__(**kwargs)
+        self.config = config
+        # Layers are named "layer_._0", "layer_._1", ...
+        self.layer = [TFTapasLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)]
+
+    def call(
+        self,
+        hidden_states: tf.Tensor,
+        attention_mask: tf.Tensor,
+        head_mask: tf.Tensor,
+        encoder_hidden_states: Optional[tf.Tensor],
+        encoder_attention_mask: Optional[tf.Tensor],
+        past_key_values: Optional[Tuple[Tuple[tf.Tensor]]],
+        use_cache: Optional[bool],
+        output_attentions: bool,
+        output_hidden_states: bool,
+        return_dict: bool,
+        training: bool = False,
+    ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]:
+        """
+        Pass ``hidden_states`` through every layer, optionally collecting per-layer outputs.
+
+        ``head_mask`` and ``past_key_values`` (when not ``None``) are indexed by layer position.
+
+        Returns:
+            A :class:`TFBaseModelOutputWithPastAndCrossAttentions` when ``return_dict`` is true, otherwise a tuple
+            of the non-``None`` values among the last hidden state, all hidden states, all attentions and all
+            cross-attentions.
+        """
+        # Accumulators stay None when the corresponding output was not requested.
+        all_hidden_states = () if output_hidden_states else None
+        all_attentions = () if output_attentions else None
+        all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None
+
+        next_decoder_cache = () if use_cache else None
+        for i, layer_module in enumerate(self.layer):
+            # Record the input of each layer; the output of the last layer is added after the loop.
+            if output_hidden_states:
+                all_hidden_states = all_hidden_states + (hidden_states,)
+
+            past_key_value = past_key_values[i] if past_key_values is not None else None
+
+            layer_outputs = layer_module(
+                hidden_states=hidden_states,
+                attention_mask=attention_mask,
+                head_mask=head_mask[i],
+                encoder_hidden_states=encoder_hidden_states,
+                encoder_attention_mask=encoder_attention_mask,
+                past_key_value=past_key_value,
+                output_attentions=output_attentions,
+                training=training,
+            )
+            hidden_states = layer_outputs[0]
+
+            # The cache is taken from the last element of the layer outputs.
+            if use_cache:
+                next_decoder_cache += (layer_outputs[-1],)
+
+            if output_attentions:
+                all_attentions = all_attentions + (layer_outputs[1],)
+                if self.config.add_cross_attention and encoder_hidden_states is not None:
+                    all_cross_attentions = all_cross_attentions + (layer_outputs[2],)
+
+        # Add last layer
+        if output_hidden_states:
+            all_hidden_states = all_hidden_states + (hidden_states,)
+
+        # NOTE(review): the tuple form does not include `next_decoder_cache`, unlike the dict form below.
+        if not return_dict:
+            return tuple(
+                v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None
+            )
+
+        return TFBaseModelOutputWithPastAndCrossAttentions(
+            last_hidden_state=hidden_states,
+            past_key_values=next_decoder_cache,
+            hidden_states=all_hidden_states,
+            attentions=all_attentions,
+            cross_attentions=all_cross_attentions,
+        )
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Tapas
+class TFTapasPooler(tf.keras.layers.Layer):
+    """Pools a sequence by passing the hidden state of its first token through a tanh dense layer."""
+
+    def __init__(self, config: TapasConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        dense_kwargs = dict(
+            units=config.hidden_size,
+            kernel_initializer=get_initializer(config.initializer_range),
+            activation="tanh",
+            name="dense",
+        )
+        self.dense = tf.keras.layers.Dense(**dense_kwargs)
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        """Return the dense projection of ``hidden_states[:, 0]``."""
+        # "Pooling" here simply means selecting the first token's hidden state.
+        return self.dense(inputs=hidden_states[:, 0])
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Tapas
+class TFTapasPredictionHeadTransform(tf.keras.layers.Layer):
+    """Dense layer, configured activation and LayerNorm applied before the LM prediction projection."""
+
+    def __init__(self, config: TapasConfig, **kwargs):
+        super().__init__(**kwargs)
+
+        kernel_init = get_initializer(config.initializer_range)
+        self.dense = tf.keras.layers.Dense(units=config.hidden_size, kernel_initializer=kernel_init, name="dense")
+
+        # `hidden_act` is either an activation name (resolved through get_tf_activation) or used as given.
+        activation = config.hidden_act
+        self.transform_act_fn = get_tf_activation(activation) if isinstance(activation, str) else activation
+
+        self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm")
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        """Return ``LayerNorm(activation(dense(hidden_states)))``."""
+        projected = self.dense(inputs=hidden_states)
+        activated = self.transform_act_fn(projected)
+        return self.LayerNorm(inputs=activated)
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Tapas
+class TFTapasLMPredictionHead(tf.keras.layers.Layer):
+    """Language-modeling head that reuses the input embedding matrix as output weights and adds a per-token bias."""
+
+    def __init__(self, config: TapasConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
+        super().__init__(**kwargs)
+
+        self.vocab_size = config.vocab_size
+        self.hidden_size = config.hidden_size
+
+        self.transform = TFTapasPredictionHeadTransform(config, name="transform")
+
+        # The output weights are the same as the input embeddings, but there is
+        # an output-only bias for each token.
+        self.input_embeddings = input_embeddings
+
+    def build(self, input_shape: tf.TensorShape):
+        """Create the zero-initialized, trainable output bias of shape ``(vocab_size,)``."""
+        self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias")
+
+        super().build(input_shape)
+
+    def get_output_embeddings(self) -> tf.keras.layers.Layer:
+        """Return the embedding layer whose ``weight`` is used for the output projection."""
+        return self.input_embeddings
+
+    def set_output_embeddings(self, value: tf.Variable):
+        """Replace the shared embedding weight and update the embedding layer's ``vocab_size`` to match."""
+        self.input_embeddings.weight = value
+        self.input_embeddings.vocab_size = shape_list(value)[0]
+
+    def get_bias(self) -> Dict[str, tf.Variable]:
+        """Return the output bias wrapped in a dict under the key ``"bias"``."""
+        return {"bias": self.bias}
+
+    def set_bias(self, value: tf.Variable):
+        """Replace the output bias and update ``vocab_size``; ``value`` is indexed with the key ``"bias"``."""
+        self.bias = value["bias"]
+        self.vocab_size = shape_list(value["bias"])[0]
+
+    def call(self, hidden_states: tf.Tensor) -> tf.Tensor:
+        """Transform ``hidden_states`` and project them onto the vocabulary using the shared embedding weight."""
+        hidden_states = self.transform(hidden_states=hidden_states)
+        seq_length = shape_list(hidden_states)[1]
+        # Flatten to 2D for the matmul against the embedding matrix, then restore the sequence axis.
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size])
+        hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True)
+        hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size])
+        hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias)
+
+        return hidden_states
+
+
+# Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->Tapas
+class TFTapasMLMHead(tf.keras.layers.Layer):
+    """Thin wrapper that exposes :class:`TFTapasLMPredictionHead` under the sub-layer name ``predictions``."""
+
+    def __init__(self, config: TapasConfig, input_embeddings: tf.keras.layers.Layer, **kwargs):
+        super().__init__(**kwargs)
+
+        self.predictions = TFTapasLMPredictionHead(config, input_embeddings, name="predictions")
+
+    def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
+        """Return the prediction scores computed from ``sequence_output``."""
+        return self.predictions(hidden_states=sequence_output)
+
+
+@keras_serializable
+class TFTapasMainLayer(tf.keras.layers.Layer):
+    """Core TAPAS layer: embeddings, encoder and an optional pooler."""
+
+    config_class = TapasConfig
+
+    def __init__(self, config: TapasConfig, add_pooling_layer: bool = True, **kwargs):
+        """Check that the ``tensorflow_probability`` backend is usable, then build the sub-layers."""
+        requires_backends(self, "tensorflow_probability")
+        super().__init__(**kwargs)
+
+        self.config = config
+
+        self.embeddings = TFTapasEmbeddings(config, name="embeddings")
+        self.encoder = TFTapasEncoder(config, name="encoder")
+        # No pooler is created (and `pooler_output` is None) when add_pooling_layer is False.
+        self.pooler = TFTapasPooler(config, name="pooler") if add_pooling_layer else None
+
+    def get_input_embeddings(self) -> tf.keras.layers.Layer:
+        """Return the embeddings layer."""
+        return self.embeddings
+
+    def set_input_embeddings(self, value: tf.Variable):
+        """Replace the word embedding weight and update the stored vocabulary size to match."""
+        self.embeddings.weight = value
+        self.embeddings.vocab_size = shape_list(value)[0]
+
+    def _prune_heads(self, heads_to_prune):
+        """
+        Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base
+        class PreTrainedModel
+        """
+        raise NotImplementedError
+
+    def call(
+        self,
+        input_ids: Optional[TFModelInputType] = None,
+        attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        training: bool = False,
+        **kwargs,
+    ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+        """
+        Embed the inputs, run the encoder and pool the first token.
+
+        Exactly one of ``input_ids`` and ``inputs_embeds`` must be provided. A missing ``attention_mask`` defaults
+        to all ones and missing ``token_type_ids`` default to all zeros.
+
+        Returns:
+            A :class:`TFBaseModelOutputWithPooling` when ``return_dict`` is true, otherwise a tuple
+            ``(sequence_output, pooled_output) + encoder_outputs[1:]``.
+
+        Raises:
+            ValueError: if both or neither of ``input_ids`` and ``inputs_embeds`` are given.
+            NotImplementedError: if ``head_mask`` is not ``None``.
+        """
+        # All arguments are read back from the `inputs` mapping returned by `input_processing`.
+        inputs = input_processing(
+            func=self.call,
+            config=self.config,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            training=training,
+            kwargs_call=kwargs,
+        )
+
+        if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None:
+            raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time")
+        elif inputs["input_ids"] is not None:
+            input_shape = shape_list(inputs["input_ids"])
+        elif inputs["inputs_embeds"] is not None:
+            input_shape = shape_list(inputs["inputs_embeds"])[:-1]
+        else:
+            raise ValueError("You have to specify either input_ids or inputs_embeds")
+
+        if inputs["attention_mask"] is None:
+            inputs["attention_mask"] = tf.fill(dims=input_shape, value=1)
+
+        if inputs["token_type_ids"] is None:
+            # One trailing entry per token type vocabulary.
+            inputs["token_type_ids"] = tf.fill(dims=input_shape + [len(self.config.type_vocab_sizes)], value=0)
+
+        embedding_output = self.embeddings(
+            input_ids=inputs["input_ids"],
+            position_ids=inputs["position_ids"],
+            token_type_ids=inputs["token_type_ids"],
+            inputs_embeds=inputs["inputs_embeds"],
+            training=inputs["training"],
+        )
+
+        # We create a broadcastable 4D attention mask from a 2D tensor mask.
+        # Sizes are [batch_size, 1, 1, to_seq_length]
+        # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length]
+        # this attention mask is more simple than the triangular masking of causal attention
+        # used in OpenAI GPT, we just need to prepare the broadcast dimension here.
+        extended_attention_mask = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1]))
+
+        # Since attention_mask is 1.0 for positions we want to attend and 0.0 for
+        # masked positions, this operation will create a tensor which is 0.0 for
+        # positions we want to attend and -10000.0 for masked positions.
+        # Since we are adding it to the raw scores before the softmax, this is
+        # effectively the same as removing these entirely.
+        extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype)
+        one_cst = tf.constant(1.0, dtype=embedding_output.dtype)
+        ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype)
+        extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst)
+
+        # Prepare head mask if needed
+        # 1.0 in head_mask indicate we keep the head
+        # attention_probs has shape bsz x n_heads x N x N
+        # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads]
+        # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length]
+        # (the conversion is not implemented here: a non-None head_mask raises, otherwise one None per layer is used)
+        if inputs["head_mask"] is not None:
+            raise NotImplementedError
+        else:
+            inputs["head_mask"] = [None] * self.config.num_hidden_layers
+
+        # The encoder is always called without encoder states, cache or past key/values.
+        encoder_outputs = self.encoder(
+            hidden_states=embedding_output,
+            attention_mask=extended_attention_mask,
+            head_mask=inputs["head_mask"],
+            encoder_hidden_states=None,
+            encoder_attention_mask=None,
+            past_key_values=None,
+            use_cache=None,
+            output_attentions=inputs["output_attentions"],
+            output_hidden_states=inputs["output_hidden_states"],
+            return_dict=inputs["return_dict"],
+            training=inputs["training"],
+        )
+
+        sequence_output = encoder_outputs[0]
+        pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None
+
+        if not inputs["return_dict"]:
+            return (
+                sequence_output,
+                pooled_output,
+            ) + encoder_outputs[1:]
+
+        return TFBaseModelOutputWithPooling(
+            last_hidden_state=sequence_output,
+            pooler_output=pooled_output,
+            hidden_states=encoder_outputs.hidden_states,
+            attentions=encoder_outputs.attentions,
+        )
+
+
+ class TFTapasPreTrainedModel(TFPreTrainedModel):
+ """
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
+ models.
+
+ It adds no behaviour of its own on top of :class:`TFPreTrainedModel`; it only declares the configuration class and
+ the base-model prefix shared by the TAPAS model classes that derive from it.
+ """
+
+ # Configuration class associated with the TAPAS models.
+ config_class = TapasConfig
+ # Same string the subclasses use as attribute name (`self.tapas`) and layer name ("tapas") for their main layer.
+ # NOTE(review): presumably consumed by TFPreTrainedModel when mapping checkpoint weights -- confirm.
+ base_model_prefix = "tapas"
+
+
+TAPAS_START_DOCSTRING = r"""
+
+ This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the
+ generic methods the library implements for all its models (such as downloading or saving, resizing the input
+ embeddings, pruning heads etc.)
+
+ This model is also a `tf.keras.Model <https://www.tensorflow.org/api_docs/python/tf/keras/Model>`__ subclass. Use
+ it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage
+ and behavior.
+
+ .. note::
+
+ TF 2.0 models accept two formats as inputs:
+
+ - having all inputs as keyword arguments (like PyTorch models), or
+ - having all inputs as a list, tuple or dict in the first positional arguments.
+
+ This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all
+ the tensors in the first argument of the model call function: :obj:`model(inputs)`.
+
+ If you choose this second option, there are three possibilities you can use to gather all the input Tensors in
+ the first positional argument :
+
+ - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)`
+ - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring:
+ :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])`
+ - a dictionary with one or several input Tensors associated to the input names given in the docstring:
+ :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})`
+
+ Parameters:
+ config (:class:`~transformers.TapasConfig`): Model configuration class with all the parameters of the model.
+ Initializing with a config file does not load the weights associated with the model, only the
+ configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model
+ weights.
+"""
+
+TAPAS_INPUTS_DOCSTRING = r"""
+ Args:
+ input_ids (:obj:`np.ndarray`, :obj:`tf.Tensor`, :obj:`List[tf.Tensor]`, :obj:`Dict[str, tf.Tensor]` or :obj:`Dict[str, np.ndarray]` and each example must have the shape :obj:`({0})`):
+ Indices of input sequence tokens in the vocabulary.
+
+ Indices can be obtained using :class:`~transformers.TapasTokenizer`. See
+ :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for
+ details.
+
+ `What are input IDs? <../glossary.html#input-ids>`__
+ attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
+ Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``:
+
+ - 1 for tokens that are **not masked**,
+ - 0 for tokens that are **masked**.
+
+ `What are attention masks? <../glossary.html#attention-mask>`__
+ token_type_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, 7)`, `optional`):
+ Token indices that encode tabular structure. Indices can be obtained using
+ :class:`~transformers.TapasTokenizer`. See this class for more info.
+
+ `What are token type IDs? <../glossary.html#token-type-ids>`__
+ position_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`):
+ Indices of positions of each input sequence tokens in the position embeddings. If
+ ``reset_position_index_per_cell`` of :class:`~transformers.TapasConfig` is set to ``True``, relative
+ position embeddings will be used. Selected in the range ``[0, config.max_position_embeddings - 1]``.
+
+ `What are position IDs? <../glossary.html#position-ids>`__
+ head_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`):
+ Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``:
+
+ - 1 indicates the head is **not masked**,
+ - 0 indicates the head is **masked**.
+
+ inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`):
+ Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation.
+ This is useful if you want more control over how to convert :obj:`input_ids` indices into associated
+ vectors than the model's internal embedding lookup matrix.
+ output_attentions (:obj:`bool`, `optional`):
+ Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned
+ tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the
+ config will be used instead.
+ output_hidden_states (:obj:`bool`, `optional`):
+ Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for
+ more detail. This argument can be used only in eager mode, in graph mode the value in the config will be
+ used instead.
+ return_dict (:obj:`bool`, `optional`):
+ Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This
+ argument can be used in eager mode, in graph mode the value will always be set to True.
+ training (:obj:`bool`, `optional`, defaults to :obj:`False`):
+ Whether or not to use the model in training mode (some modules like dropout modules have different
+ behaviors between training and evaluation).
+"""
+
+
+ @add_start_docstrings(
+ "The bare Tapas Model transformer outputting raw hidden-states without any specific head on top.",
+ TAPAS_START_DOCSTRING,
+ )
+ class TFTapasModel(TFTapasPreTrainedModel):
+ # Thin wrapper: every forward computation is delegated to the `tapas` main layer created in `__init__`.
+ def __init__(self, config: TapasConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ # The layer name "tapas" is the same string as `base_model_prefix` on the parent class.
+ self.tapas = TFTapasMainLayer(config, name="tapas")
+
+ @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ processor_class=_TOKENIZER_FOR_DOC,
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFBaseModelOutputWithPooling,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: Optional[TFModelInputType] = None,
+ attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ training: Optional[bool] = False,
+ **kwargs,
+ ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]:
+ r"""
+ Returns:
+
+ Examples::
+
+ >>> from transformers import TapasTokenizer, TFTapasModel
+ >>> import pandas as pd
+
+ >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base')
+ >>> model = TFTapasModel.from_pretrained('google/tapas-base')
+
+ >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+ ... 'Age': ["56", "45", "59"],
+ ... 'Number of movies': ["87", "53", "69"]
+ ... }
+ >>> table = pd.DataFrame.from_dict(data)
+ >>> queries = ["How many movies has George Clooney played in?", "How old is Brad Pitt?"]
+
+ >>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf")
+ >>> outputs = model(**inputs)
+
+ >>> last_hidden_states = outputs.last_hidden_state
+ """
+ # Gather every call argument into one dict keyed by argument name.
+ # NOTE(review): the exact normalisation done by `input_processing` is defined outside this block -- confirm.
+ inputs = input_processing(
+ func=self.call,
+ config=self.config,
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ training=training,
+ kwargs_call=kwargs,
+ )
+ # Forward the processed values to the main layer; its result is returned unchanged.
+ outputs = self.tapas(
+ input_ids=inputs["input_ids"],
+ attention_mask=inputs["attention_mask"],
+ token_type_ids=inputs["token_type_ids"],
+ position_ids=inputs["position_ids"],
+ head_mask=inputs["head_mask"],
+ inputs_embeds=inputs["inputs_embeds"],
+ output_attentions=inputs["output_attentions"],
+ output_hidden_states=inputs["output_hidden_states"],
+ return_dict=inputs["return_dict"],
+ training=inputs["training"],
+ )
+
+ return outputs
+
+ def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling:
+ # Hidden states / attentions are converted to a single tensor only when the config enables them;
+ # otherwise the corresponding field of the returned output is None.
+ hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
+ attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
+
+ return TFBaseModelOutputWithPooling(
+ last_hidden_state=output.last_hidden_state,
+ pooler_output=output.pooler_output,
+ hidden_states=hs,
+ attentions=attns,
+ )
+
+
+ @add_start_docstrings("""Tapas Model with a `language modeling` head on top. """, TAPAS_START_DOCSTRING)
+ class TFTapasForMaskedLM(TFTapasPreTrainedModel, TFMaskedLanguageModelingLoss):
+ def __init__(self, config: TapasConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ # Only a warning is emitted for decoder configs; construction continues regardless.
+ if config.is_decoder:
+ logger.warning(
+ "If you want to use `TFTapasForMaskedLM` make sure `config.is_decoder=False` for "
+ "bi-directional self-attention."
+ )
+
+ # The main layer is built without a pooling layer; the LM head receives the main layer's embeddings object.
+ self.tapas = TFTapasMainLayer(config, add_pooling_layer=False, name="tapas")
+ self.lm_head = TFTapasMLMHead(config, input_embeddings=self.tapas.embeddings, name="cls")
+
+ def get_lm_head(self) -> tf.keras.layers.Layer:
+ # Returns the `predictions` sub-layer of the MLM head, not the head wrapper itself.
+ return self.lm_head.predictions
+
+ @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ processor_class=_TOKENIZER_FOR_DOC,
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFMaskedLMOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: Optional[TFModelInputType] = None,
+ attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ training: Optional[bool] = False,
+ **kwargs,
+ ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]:
+ r"""
+ labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`):
+ Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ...,
+ config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored
+ (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]``
+
+ Returns:
+
+ Examples::
+
+ >>> from transformers import TapasTokenizer, TFTapasForMaskedLM
+ >>> import pandas as pd
+
+ >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base')
+ >>> model = TFTapasForMaskedLM.from_pretrained('google/tapas-base')
+
+ >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+ ... 'Age': ["56", "45", "59"],
+ ... 'Number of movies': ["87", "53", "69"]
+ ... }
+ >>> table = pd.DataFrame.from_dict(data)
+
+ >>> inputs = tokenizer(table=table, queries="How many [MASK] has George [MASK] played in?", return_tensors="tf")
+ >>> labels = tokenizer(table=table, queries="How many movies has George Clooney played in?", return_tensors="tf")["input_ids"]
+
+ >>> outputs = model(**inputs, labels=labels)
+ >>> logits = outputs.logits
+ """
+ # Gather every call argument (including `labels`) into one dict keyed by argument name.
+ # NOTE(review): the exact normalisation done by `input_processing` is defined outside this block -- confirm.
+ inputs = input_processing(
+ func=self.call,
+ config=self.config,
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ labels=labels,
+ training=training,
+ kwargs_call=kwargs,
+ )
+ outputs = self.tapas(
+ input_ids=inputs["input_ids"],
+ attention_mask=inputs["attention_mask"],
+ token_type_ids=inputs["token_type_ids"],
+ position_ids=inputs["position_ids"],
+ head_mask=inputs["head_mask"],
+ inputs_embeds=inputs["inputs_embeds"],
+ output_attentions=inputs["output_attentions"],
+ output_hidden_states=inputs["output_hidden_states"],
+ return_dict=inputs["return_dict"],
+ training=inputs["training"],
+ )
+ # First element of the main layer's output is the sequence output that feeds the LM head.
+ sequence_output = outputs[0]
+ prediction_scores = self.lm_head(sequence_output)
+ # The loss is only computed when labels were supplied.
+ loss = (
+ None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=prediction_scores)
+ )
+
+ if not inputs["return_dict"]:
+ # Tuple output: elements 0 and 1 of the main layer's tuple are dropped and replaced by the scores.
+ # NOTE(review): element 1 is presumably the (here unused) pooled output -- confirm against the main layer.
+ output = (prediction_scores,) + outputs[2:]
+ return ((loss,) + output) if loss is not None else output
+
+ return TFMaskedLMOutput(
+ loss=loss,
+ logits=prediction_scores,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+ def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput:
+ # Hidden states / attentions are converted to a single tensor only when the config enables them.
+ hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
+ attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
+
+ # The serving output is built without a `loss` value.
+ return TFMaskedLMOutput(logits=output.logits, hidden_states=hs, attentions=attns)
+
+
+ class TFTapasComputeTokenLogits(tf.keras.layers.Layer):
+ """
+ Layer that maps each token's hidden vector to one scalar logit using a learned weight vector of size
+ ``config.hidden_size``, a scalar bias and a temperature taken from the config.
+ """
+
+ def __init__(self, config: TapasConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ # Divisor applied to the logits in `call`.
+ self.temperature = config.temperature
+ # cell selection heads
+ with tf.name_scope("output"):
+ # Weight vector: zeros when `config.init_cell_selection_weights_to_zero` is set, otherwise a truncated
+ # normal with stddev `config.initializer_range`.
+ self.output_weights = self.add_weight(
+ name="output_weights",
+ shape=(config.hidden_size,),
+ dtype=tf.float32,
+ trainable=True,
+ initializer=tf.zeros_initializer()
+ if config.init_cell_selection_weights_to_zero
+ else tf.keras.initializers.TruncatedNormal(stddev=config.initializer_range),
+ )
+ # Scalar bias, always initialised to zero.
+ self.output_bias = self.add_weight(
+ name="output_bias", shape=(), trainable=True, initializer=tf.zeros_initializer()
+ )
+
+ def call(self, sequence_output: tf.Tensor) -> tf.Tensor:
+ """
+ Computes logits per token
+
+ Args:
+ sequence_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ Also known as last_hidden_state. Sequence of hidden-states at the output of the last layer of the
+ model.
+
+ Returns:
+ logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Logits per token.
+ """
+ # "bsj,j->bs" contracts the last axis of `sequence_output` with the weight vector; the bias is added and the
+ # sum is divided by the temperature.
+ logits = (tf.einsum("bsj,j->bs", sequence_output, self.output_weights) + self.output_bias) / self.temperature
+ return logits
+
+
+ class TFTapasComputeColumnLogits(tf.keras.layers.Layer):
+ """
+ Layer that produces one logit per table column from per-token hidden vectors, using its own learned weight vector
+ of size ``config.hidden_size`` and a scalar bias.
+ """
+
+ def __init__(self, config: TapasConfig, **kwargs):
+ super().__init__(**kwargs)
+
+ with tf.name_scope("column_output"):
+ # Weight vector: zeros when `config.init_cell_selection_weights_to_zero` is set, otherwise a truncated
+ # normal with stddev `config.initializer_range`.
+ self.column_output_weights = self.add_weight(
+ name="column_output_weights",
+ shape=[config.hidden_size],
+ dtype=tf.float32,
+ trainable=True,
+ initializer=tf.zeros_initializer()
+ if config.init_cell_selection_weights_to_zero
+ else tf.keras.initializers.TruncatedNormal(stddev=config.initializer_range),
+ )
+ # Scalar bias, always initialised to zero.
+ self.column_output_bias = self.add_weight(
+ name="column_output_bias", shape=(), trainable=True, initializer=tf.zeros_initializer()
+ )
+
+ def call(self, sequence_output, cell_index, cell_mask, allow_empty_column_selection) -> tf.Tensor:
+ """
+ Computes the column logits.
+
+ Args:
+ sequence_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`):
+ Also known as last_hidden_state. Sequence of hidden-states at the output of the last layer of the
+ model.
+ cell_index (:obj:`ProductIndexMap`):
+ Index that groups tokens into cells.
+ cell_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, max_num_rows * max_num_cols)`):
+ Mask for cells that exist in the table (i.e. that are not padding).
+ allow_empty_column_selection (:obj:`bool`):
+ Whether to allow not to select any column
+
+ Returns:
+ column_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, max_num_cols)`): Tensor containing the column
+ logits for every example in the batch.
+ """
+
+ # First, compute the token logits (batch_size, seq_len) - without temperature
+ token_logits = tf.einsum("bsj,j->bs", sequence_output, self.column_output_weights) + self.column_output_bias
+
+ # Next, average the logits per cell (batch_size, max_num_cols*max_num_rows)
+ cell_logits, cell_logits_index = reduce_mean(token_logits, cell_index)
+
+ # Finally, average the logits per column (batch_size, max_num_cols)
+ # The average is taken as a masked sum divided by the number of existing cells in the column.
+ column_index = cell_index.project_inner(cell_logits_index)
+ column_logits, out_index = reduce_sum(cell_logits * cell_mask, column_index)
+
+ cell_count, _ = reduce_sum(cell_mask, column_index)
+ # The epsilon keeps the division defined for columns without any existing cell.
+ column_logits /= cell_count + EPSILON_ZERO_DIVISION
+
+ # Mask columns that do not appear in the example.
+ # A column counts as padding when it has no existing cells and its index is not 0.
+ is_padding = tf.logical_and(cell_count < 0.5, tf.not_equal(out_index.indices, 0))
+ column_logits += CLOSE_ENOUGH_TO_LOG_ZERO * tf.cast(is_padding, tf.float32)
+
+ # Index 0 is additionally suppressed when selecting no column is not allowed.
+ # NOTE(review): index 0 presumably stands for the "no column selected" choice -- confirm.
+ if not allow_empty_column_selection:
+ column_logits += CLOSE_ENOUGH_TO_LOG_ZERO * tf.cast(tf.equal(out_index.indices, 0), tf.float32)
+
+ return column_logits
+
+
+@add_start_docstrings(
+ """
+ Tapas Model with a cell selection head and optional aggregation head on top for question-answering tasks on tables
+ (linear layers on top of the hidden-states output to compute `logits` and optional `logits_aggregation`), e.g. for
+ SQA, WTQ or WikiSQL-supervised tasks.
+ """,
+ TAPAS_START_DOCSTRING,
+)
+class TFTapasForQuestionAnswering(TFTapasPreTrainedModel):
+ def __init__(self, config: TapasConfig, *inputs, **kwargs):
+ super().__init__(config, *inputs, **kwargs)
+
+ # base model
+ self.tapas = TFTapasMainLayer(config, name="tapas")
+
+ # dropout
+ self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob)
+
+ self.compute_token_logits = TFTapasComputeTokenLogits(config, name="compute_token_logits")
+
+ self.compute_column_logits = TFTapasComputeColumnLogits(config, name="compute_column_logits")
+
+ if config.num_aggregation_labels > 0:
+ self.aggregation_classifier = tf.keras.layers.Dense(
+ config.num_aggregation_labels,
+ kernel_initializer=get_initializer(config.initializer_range),
+ name="aggregation_classifier",
+ )
+ self.config = config
+
+ @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+ @add_code_sample_docstrings(
+ processor_class=_TOKENIZER_FOR_DOC,
+ checkpoint=_CHECKPOINT_FOR_DOC,
+ output_type=TFTableQuestionAnsweringOutput,
+ config_class=_CONFIG_FOR_DOC,
+ )
+ def call(
+ self,
+ input_ids: Optional[TFModelInputType] = None,
+ attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ table_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ aggregation_labels: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ float_answer: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ numeric_values: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ numeric_values_scale: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ output_attentions: Optional[bool] = None,
+ output_hidden_states: Optional[bool] = None,
+ return_dict: Optional[bool] = None,
+ labels: Optional[Union[np.ndarray, tf.Tensor]] = None,
+ training: Optional[bool] = False,
+ **kwargs,
+ ) -> Union[TFTableQuestionAnsweringOutput, Tuple[tf.Tensor]]:
+ r"""
+ table_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`, `optional`):
+ Mask for the table. Indicates which tokens belong to the table (1). Question tokens, table headers and
+ padding are 0.
+ labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`, `optional`):
+ Labels per token for computing the hierarchical cell selection loss. This encodes the positions of the
+ answer appearing in the table. Can be obtained using :class:`~transformers.TapasTokenizer`.
+
+ - 1 for tokens that are **part of the answer**,
+ - 0 for tokens that are **not part of the answer**.
+
+ aggregation_labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`, `optional`):
+ Aggregation function index for every example in the batch for computing the aggregation loss. Indices
+ should be in :obj:`[0, ..., config.num_aggregation_labels - 1]`. Only required in case of strong
+ supervision for aggregation (WikiSQL-supervised).
+ float_answer (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`, `optional`):
+ Float answer for every example in the batch. Set to `float('nan')` for cell selection questions. Only
+ required in case of weak supervision (WTQ) to calculate the aggregate mask and regression loss.
+ numeric_values (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`, `optional`):
+ Numeric values of every token, NaN for tokens which are not numeric values. Can be obtained using
+ :class:`~transformers.TapasTokenizer`. Only required in case of weak supervision for aggregation (WTQ) to
+ calculate the regression loss.
+ numeric_values_scale (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`, `optional`):
+ Scale of the numeric values of every token. Can be obtained using :class:`~transformers.TapasTokenizer`.
+ Only required in case of weak supervision for aggregation (WTQ) to calculate the regression loss.
+
+ Returns:
+
+ Examples::
+
+ >>> from transformers import TapasTokenizer, TapasForQuestionAnswering
+ >>> import pandas as pd
+
+ >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base-finetuned-wtq')
+ >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base-finetuned-wtq')
+
+ >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+ ... 'Age': ["56", "45", "59"],
+ ... 'Number of movies': ["87", "53", "69"]
+ ... }
+ >>> table = pd.DataFrame.from_dict(data)
+ >>> queries = ["How many movies has George Clooney played in?", "How old is Brad Pitt?"]
+
+ >>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf")
+ >>> outputs = model(**inputs)
+
+ >>> logits = outputs.logits
+ >>> logits_aggregation = outputs.logits_aggregation
+ """
+
+ inputs = input_processing(
+ func=self.call,
+ config=self.config,
+ input_ids=input_ids,
+ attention_mask=attention_mask,
+ token_type_ids=token_type_ids,
+ position_ids=position_ids,
+ head_mask=head_mask,
+ inputs_embeds=inputs_embeds,
+ table_mask=table_mask,
+ aggregation_labels=aggregation_labels,
+ float_answer=float_answer,
+ numeric_values=numeric_values,
+ numeric_values_scale=numeric_values_scale,
+ output_attentions=output_attentions,
+ output_hidden_states=output_hidden_states,
+ return_dict=return_dict,
+ labels=labels,
+ training=training,
+ kwargs_call=kwargs,
+ )
+ outputs = self.tapas(
+ input_ids=inputs["input_ids"],
+ attention_mask=inputs["attention_mask"],
+ token_type_ids=inputs["token_type_ids"],
+ position_ids=inputs["position_ids"],
+ head_mask=inputs["head_mask"],
+ inputs_embeds=inputs["inputs_embeds"],
+ output_attentions=inputs["output_attentions"],
+ output_hidden_states=inputs["output_hidden_states"],
+ return_dict=inputs["return_dict"],
+ training=inputs["training"],
+ )
+
+ sequence_output = outputs[0]
+ pooled_output = outputs[1]
+
+ sequence_output = self.dropout(sequence_output)
+
+ if inputs["input_ids"] is not None:
+ input_shape = shape_list(inputs["input_ids"])
+ else:
+ input_shape = shape_list(inputs["inputs_embeds"])[:-1]
+
+ # Construct indices for the table.
+ if inputs["token_type_ids"] is None:
+ inputs["token_type_ids"] = tf.fill(input_shape + [len(self.config.type_vocab_sizes)], 0)
+
+ token_types = [
+ "segment_ids",
+ "column_ids",
+ "row_ids",
+ "prev_labels",
+ "column_ranks",
+ "inv_column_ranks",
+ "numeric_relations",
+ ]
+
+ row_ids = inputs["token_type_ids"][:, :, token_types.index("row_ids")]
+ column_ids = inputs["token_type_ids"][:, :, token_types.index("column_ids")]
+
+ # Construct indices for the table.
+ row_index = IndexMap(
+ indices=tf.minimum(tf.cast(row_ids, tf.int32), self.config.max_num_rows - 1),
+ num_segments=self.config.max_num_rows,
+ batch_dims=1,
+ )
+ col_index = IndexMap(
+ indices=tf.minimum(tf.cast(column_ids, tf.int32), self.config.max_num_columns - 1),
+ num_segments=self.config.max_num_columns,
+ batch_dims=1,
+ )
+ cell_index = ProductIndexMap(row_index, col_index)
+
+ # Masks.
+ input_shape = (
+ shape_list(inputs["input_ids"])
+ if inputs["input_ids"] is not None
+ else shape_list(inputs["inputs_embeds"])[:-1]
+ )
+ if inputs["attention_mask"] is None:
+ inputs["attention_mask"] = tf.ones(input_shape)
+ # Table cells only, without question tokens and table headers.
+ if inputs["table_mask"] is None:
+ inputs["table_mask"] = tf.where(row_ids > 0, tf.ones_like(row_ids), tf.zeros_like(row_ids))
+ # [batch_size, seq_length]
+ input_mask_float = tf.cast(inputs["attention_mask"], tf.float32)
+ table_mask_float = tf.cast(inputs["table_mask"], tf.float32)
+
+ # Mask for cells that exist in the table (i.e. that are not padding).
+ cell_mask, _ = reduce_mean(input_mask_float, cell_index)
+
+ # Compute logits per token. These are used to select individual cells.
+ logits = self.compute_token_logits(sequence_output)
+
+ # Compute logits per column. These are used to select a column.
+ column_logits = None
+ if self.config.select_one_column:
+ column_logits = self.compute_column_logits(
+ sequence_output, cell_index, cell_mask, self.config.allow_empty_column_selection
+ )
+
+ # Aggregate logits.
+ logits_aggregation = None
+ if self.config.num_aggregation_labels > 0:
+ logits_aggregation = self.aggregation_classifier(pooled_output)
+
+ # Total loss calculation
+ total_loss = 0.0
+ calculate_loss = False
+ if inputs["labels"] is not None:
+ calculate_loss = True
+ is_supervised = not self.config.num_aggregation_labels > 0 or not self.config.use_answer_as_supervision
+
+ # Semi-supervised cell selection in case of no aggregation:
+ # If the answer (the denotation) appears directly in the table we might
+ # select the answer without applying any aggregation function. There are
+ # some ambiguous cases, see utils._calculate_aggregate_mask for more info.
+ # `aggregate_mask` is 1 for examples where we chose to aggregate and 0
+ # for examples where we chose to select the answer directly.
+ # `labels` encodes the positions of the answer appearing in the table.
+ if is_supervised:
+ aggregate_mask = None
+ else:
+ if inputs["float_answer"] is not None:
+ assert (
+ shape_list(inputs["labels"])[0] == shape_list(inputs["float_answer"])[0]
+ ), "Make sure the answers are a FloatTensor of shape (batch_size,)"
+ # [batch_size]
+ aggregate_mask = _calculate_aggregate_mask(
+ inputs["float_answer"],
+ pooled_output,
+ self.config.cell_selection_preference,
+ inputs["labels"],
+ self.aggregation_classifier,
+ )
+ else:
+ aggregate_mask = None
+ raise ValueError("You have to specify float answers in order to calculate the aggregate mask")
+
+ # Cell selection log-likelihood
+ if self.config.average_logits_per_cell:
+ logits_per_cell, _ = reduce_mean(logits, cell_index)
+ logits = gather(logits_per_cell, cell_index)
+ dist_per_token = tfp.distributions.Bernoulli(logits=logits)
+
+ # Compute cell selection loss per example.
+ selection_loss_per_example = None
+ if not self.config.select_one_column:
+ weight = tf.where(
+ inputs["labels"] == 0,
+ tf.ones_like(inputs["labels"], dtype=tf.float32),
+ self.config.positive_label_weight * tf.ones_like(inputs["labels"], dtype=tf.float32),
+ )
+ selection_loss_per_token = -dist_per_token.log_prob(inputs["labels"]) * weight
+ selection_loss_per_example = tf.reduce_sum(selection_loss_per_token * input_mask_float, axis=1) / (
+ tf.reduce_sum(input_mask_float, axis=1) + EPSILON_ZERO_DIVISION
+ )
+ else:
+ selection_loss_per_example, logits = _single_column_cell_selection_loss(
+ logits, column_logits, inputs["labels"], cell_index, col_index, cell_mask
+ )
+ dist_per_token = tfp.distributions.Bernoulli(logits=logits)
+
+ # Supervised cell selection
+ if self.config.disable_per_token_loss:
+ pass
+ elif is_supervised:
+ total_loss += tf.reduce_mean(selection_loss_per_example)
+ else:
+ # For the not supervised case, do not assign loss for cell selection
+ total_loss += tf.reduce_mean(selection_loss_per_example * (1.0 - aggregate_mask))
+
+ # Semi-supervised regression loss and supervised loss for aggregations
+ if self.config.num_aggregation_labels > 0:
+ if is_supervised:
+ # Note that `aggregate_mask` is None if the setting is supervised.
+ if inputs["aggregation_labels"] is not None:
+ assert (
+ shape_list(inputs["labels"])[0] == shape_list(inputs["aggregation_labels"])[0]
+ ), "Make sure the aggregation labels are a LongTensor of shape (batch_size,)"
+ per_example_additional_loss = _calculate_aggregation_loss(
+ logits_aggregation,
+ aggregate_mask,
+ inputs["aggregation_labels"],
+ self.config.use_answer_as_supervision,
+ self.config.num_aggregation_labels,
+ self.config.aggregation_loss_weight,
+ )
+ else:
+ raise ValueError(
+ "You have to specify aggregation labels in order to calculate the aggregation loss"
+ )
+ else:
+ aggregation_labels = tf.zeros(shape_list(inputs["labels"])[0], dtype=tf.int32)
+ per_example_additional_loss = _calculate_aggregation_loss(
+ logits_aggregation,
+ aggregate_mask,
+ aggregation_labels,
+ self.config.use_answer_as_supervision,
+ self.config.num_aggregation_labels,
+ self.config.aggregation_loss_weight,
+ )
+
+ if self.config.use_answer_as_supervision:
+ if inputs["numeric_values"] is not None and inputs["numeric_values_scale"] is not None:
+ assert shape_list(inputs["numeric_values"]) == shape_list(inputs["numeric_values_scale"])
+ # Add regression loss for numeric answers which require aggregation.
+ answer_loss, large_answer_loss_mask = _calculate_regression_loss(
+ inputs["float_answer"],
+ aggregate_mask,
+ dist_per_token,
+ inputs["numeric_values"],
+ inputs["numeric_values_scale"],
+ table_mask_float,
+ logits_aggregation,
+ self.config,
+ )
+ per_example_additional_loss += answer_loss
+ # Zero loss for examples with answer_loss > cutoff.
+ per_example_additional_loss *= large_answer_loss_mask
+ else:
+ raise ValueError(
+ "You have to specify numeric values and numeric values scale in order to calculate the regression loss"
+ )
+ total_loss += tf.reduce_mean(per_example_additional_loss)
+
+ else:
+ # if no label ids are provided, set them to zeros in order to properly compute logits
+ labels = tf.zeros_like(logits)
+ _, logits = _single_column_cell_selection_loss(
+ logits, column_logits, labels, cell_index, col_index, cell_mask
+ )
+ if not inputs["return_dict"]:
+ output = (logits, logits_aggregation) + outputs[2:]
+ return ((total_loss,) + output) if calculate_loss else output
+
+ return TFTableQuestionAnsweringOutput(
+ loss=total_loss if calculate_loss else None,
+ logits=logits,
+ logits_aggregation=logits_aggregation,
+ hidden_states=outputs.hidden_states,
+ attentions=outputs.attentions,
+ )
+
+    def serving_output(self, output: TFTableQuestionAnsweringOutput) -> TFTableQuestionAnsweringOutput:
+        # Hidden states / attentions are converted to tensors only when the config enables them.
+        hidden = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
+        attn = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
+        return TFTableQuestionAnsweringOutput(
+            logits=output.logits, logits_aggregation=output.logits_aggregation, hidden_states=hidden, attentions=attn
+        )
+
+
+@add_start_docstrings(
+    """
+    Tapas Model with a sequence classification head on top (a linear layer on top of the pooled output), e.g. for table
+    entailment tasks, such as TabFact (Chen et al., 2020).
+    """,
+    TAPAS_START_DOCSTRING,
+)
+class TFTapasForSequenceClassification(TFTapasPreTrainedModel, TFSequenceClassificationLoss):
+    def __init__(self, config: TapasConfig, *inputs, **kwargs):
+        super().__init__(config, *inputs, **kwargs)
+        self.num_labels = config.num_labels
+
+        self.tapas = TFTapasMainLayer(config, name="tapas")
+        self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout")
+        self.classifier = tf.keras.layers.Dense(
+            config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier"
+        )
+
+    @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length"))
+    @add_code_sample_docstrings(
+        processor_class=_TOKENIZER_FOR_DOC,
+        checkpoint=_CHECKPOINT_FOR_DOC,
+        output_type=TFSequenceClassifierOutput,
+        config_class=_CONFIG_FOR_DOC,
+    )
+    def call(
+        self,
+        input_ids: Optional[TFModelInputType] = None,
+        attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        output_attentions: Optional[bool] = None,
+        output_hidden_states: Optional[bool] = None,
+        return_dict: Optional[bool] = None,
+        labels: Optional[Union[np.ndarray, tf.Tensor]] = None,
+        training: Optional[bool] = False,
+        **kwargs,
+    ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]:
+        r"""
+        labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size,)`, `optional`):
+            Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ...,
+            config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss),
+            If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Note: this is called
+            "classification_class_index" in the original implementation.
+
+        Returns:
+
+        Examples::
+
+            >>> from transformers import TapasTokenizer, TFTapasForSequenceClassification
+            >>> import tensorflow as tf
+            >>> import pandas as pd
+
+            >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base-finetuned-tabfact')
+            >>> model = TFTapasForSequenceClassification.from_pretrained('google/tapas-base-finetuned-tabfact')
+
+            >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+            ...         'Age': ["56", "45", "59"],
+            ...         'Number of movies': ["87", "53", "69"]
+            ... }
+            >>> table = pd.DataFrame.from_dict(data)
+            >>> queries = ["There is only one actor who is 45 years old", "There are 3 actors which played in more than 60 movies"]
+
+            >>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf")
+            >>> labels = tf.convert_to_tensor([1, 0]) # 1 means entailed, 0 means refuted
+
+            >>> outputs = model(**inputs, labels=labels)
+            >>> loss = outputs.loss
+            >>> logits = outputs.logits
+        """
+
+        inputs = input_processing(
+            func=self.call,
+            config=self.config,
+            input_ids=input_ids,
+            attention_mask=attention_mask,
+            token_type_ids=token_type_ids,
+            position_ids=position_ids,
+            head_mask=head_mask,
+            inputs_embeds=inputs_embeds,
+            output_attentions=output_attentions,
+            output_hidden_states=output_hidden_states,
+            return_dict=return_dict,
+            labels=labels,
+            training=training,
+            kwargs_call=kwargs,
+        )
+        outputs = self.tapas(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            token_type_ids=inputs["token_type_ids"],
+            position_ids=inputs["position_ids"],
+            head_mask=inputs["head_mask"],
+            inputs_embeds=inputs["inputs_embeds"],
+            output_attentions=inputs["output_attentions"],
+            output_hidden_states=inputs["output_hidden_states"],
+            return_dict=inputs["return_dict"],
+            training=inputs["training"],
+        )
+        pooled_output = outputs[1]  # second element of the main layer output feeds the classifier
+        pooled_output = self.dropout(inputs=pooled_output, training=inputs["training"])
+        logits = self.classifier(inputs=pooled_output)
+        loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=logits)
+        # Tuple output: logits followed by outputs[2:], with the loss prepended only when labels were given.
+        if not inputs["return_dict"]:
+            output = (logits,) + outputs[2:]
+            return ((loss,) + output) if loss is not None else output
+
+        return TFSequenceClassifierOutput(
+            loss=loss,
+            logits=logits,
+            hidden_states=outputs.hidden_states,
+            attentions=outputs.attentions,
+        )
+
+    def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput:
+        hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None
+        attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None
+
+        return TFSequenceClassifierOutput(logits=output.logits, hidden_states=hs, attentions=attns)
+
+
+""" TAPAS utilities."""
+
+
+class AverageApproximationFunction(str, enum.Enum):
+    RATIO = "ratio"  # in `_calculate_expected_result`: average = expected sum / expected count
+    FIRST_ORDER = "first_order"  # each value weighted by p / (1 + sum of the other cells' p)
+    SECOND_ORDER = "second_order"  # first-order weights with an additional variance-based correction
+
+
+# Beginning of everything related to segmented tensors
+
+
+class IndexMap(object):
+    """Index grouping entries within a tensor: each entry is assigned the id of the segment it belongs to."""
+
+    def __init__(self, indices, num_segments, batch_dims=0):
+        """
+        Creates an index.
+
+        Args:
+            indices: Tensor of segment ids, same shape as the values it groups (converted with `tf.convert_to_tensor`).
+            num_segments: Scalar tensor, the number of segments. All elements
+                in a batched segmented tensor must have the same number of segments (although many segments can be empty).
+            batch_dims: Python integer, the number of batch dimensions. The first
+                `batch_dims` dimensions of a SegmentedTensor are treated as batch dimensions. Segments in different batch
+                elements are always distinct even if they have the same index.
+        """
+        self.indices = tf.convert_to_tensor(indices)
+        self.num_segments = tf.convert_to_tensor(num_segments)
+        self.batch_dims = batch_dims
+
+    def batch_shape(self):
+        return tf.shape(self.indices)[: self.batch_dims]  # dynamic shape of the leading `batch_dims` dimensions
+
+
+class ProductIndexMap(IndexMap):
+    """The product of two indices."""
+
+    def __init__(self, outer_index, inner_index):
+        """
+        Combines indices i and j into pairs (i, j). The result is an index where each segment (i, j) is the
+        intersection of segments i and j. For example if the inputs represent table cells indexed by respectively rows
+        and columns the output will be a table indexed by (row, column) pairs, i.e. by cell. The implementation
+        combines indices {0, .., n - 1} and {0, .., m - 1} into {0, .., nm - 1}. The output has `num_segments` equal to
+        `outer_index.num_segments` * `inner_index.num_segments`.
+
+        Args:
+            outer_index: IndexMap.
+            inner_index: IndexMap, must have the same shape as `outer_index`.
+        """
+        if outer_index.batch_dims != inner_index.batch_dims:
+            raise ValueError("outer_index.batch_dims and inner_index.batch_dims must be the same.")
+        # A pair (i, j) is encoded as the single segment id `i * inner_index.num_segments + j`.
+        super().__init__(
+            indices=(inner_index.indices + outer_index.indices * inner_index.num_segments),
+            num_segments=inner_index.num_segments * outer_index.num_segments,
+            batch_dims=inner_index.batch_dims,
+        )
+        self.outer_index = outer_index
+        self.inner_index = inner_index
+
+    def project_outer(self, index):
+        """Projects an index with the same index set onto the outer components."""
+        # Product ids are built as `outer * inner.num_segments + inner`, so the outer
+        # component is recovered by integer division by the inner segment count.
+        inner_size = self.inner_index.num_segments
+        outer_ids = tf.math.floordiv(index.indices, inner_size)
+        return IndexMap(indices=outer_ids, num_segments=self.outer_index.num_segments, batch_dims=index.batch_dims)
+
+    def project_inner(self, index):
+        """Projects an index with the same index set onto the inner components."""
+        # Product ids are built as `outer * inner.num_segments + inner`, so the inner
+        # component is the remainder modulo the inner segment count.
+        inner_size = self.inner_index.num_segments
+        inner_ids = tf.math.floormod(index.indices, inner_size)
+        return IndexMap(indices=inner_ids, num_segments=inner_size, batch_dims=index.batch_dims)
+
+
+def gather(values, index, name="segmented_gather"):
+    """
+    Looks up, for every element in the domain of the index map, the value stored for its segment in `values`.
+    Elements that belong to the same segment therefore always receive the same value.
+
+    Args:
+        values: [B1, ..., Bn, num_segments, V1, ...] Tensor with segment values.
+        index: [B1, ..., Bn, I1, ..., Ik] IndexMap.
+        name: Name for the TensorFlow operation.
+
+    Returns:
+        [B1, ..., Bn, I1, ..., Ik, V1, ...] Tensor with the gathered values.
+    """
+    return tf.gather(params=values, indices=index.indices, batch_dims=index.batch_dims, name=name)
+
+
+def flatten(index, name="segmented_flatten"):
+    """
+    Flattens a batched index map to a 1d index map. This operation relabels the segments to keep batch elements
+    distinct. The k-th batch element (counting from 0) has its indices shifted by `num_segments` * k. The result is an
+    index map with `num_segments` multiplied by the number of elements in the batch.
+
+    Args:
+        index: IndexMap to flatten.
+        name: Name for the TensorFlow operation. Currently not used.
+
+    Returns:
+        The flattened IndexMap.
+    """
+    batch_size = tf.reduce_prod(index.batch_shape())  # total number of batch elements
+    offset = tf.range(batch_size) * index.num_segments  # one id offset per batch element
+    offset = tf.reshape(offset, index.batch_shape())
+    for _ in range(index.batch_dims, index.indices.shape.rank):  # one trailing axis per non-batch dim of `indices`
+        offset = tf.expand_dims(offset, -1)
+
+    indices = offset + index.indices
+    return IndexMap(indices=tf.reshape(indices, [-1]), num_segments=index.num_segments * batch_size, batch_dims=0)
+
+
+def range_index_map(batch_shape, num_segments, name="range_index_map"):
+    """
+    Constructs an index map equal to range(num_segments).
+
+    Args:
+        batch_shape (:obj:`tf.Tensor`):
+            Batch shape; must be rank 1 once converted to a tensor (asserted below).
+        num_segments (:obj:`int`):
+            Number of segments; must be a scalar once converted to a tensor (asserted below).
+        name (:obj:`str`, `optional`, defaults to 'range_index_map'):
+            Name for the operation. Currently not used
+
+    Returns:
+        (:obj:`IndexMap`): IndexMap of shape batch_shape with elements equal to range(num_segments).
+    """
+    batch_shape = tf.convert_to_tensor(batch_shape)
+    batch_shape.shape.assert_has_rank(1)
+    num_segments = tf.convert_to_tensor(num_segments)
+    num_segments.shape.assert_has_rank(0)
+
+    indices = tf.range(num_segments)
+    shape = tf.concat([tf.ones_like(batch_shape, dtype=tf.int32), tf.expand_dims(num_segments, axis=0)], axis=0)
+    indices = tf.reshape(indices, shape)  # [1, ..., 1, num_segments]
+    multiples = tf.concat([batch_shape, [1]], axis=0)  # repeat along every batch dim, not along the segment dim
+    indices = tf.tile(indices, multiples)
+    return IndexMap(indices=indices, num_segments=num_segments, batch_dims=batch_shape.shape.as_list()[0])
+
+
+def _segment_reduce(values, index, segment_reduce_fn, name):
+    """
+    Applies a segment reduction segment-wise.
+
+    Args:
+        values (:obj:`tf.Tensor`):
+            Tensor with segment values.
+        index (:obj:`IndexMap`):
+            IndexMap.
+        segment_reduce_fn (:obj:`Callable`):
+            Reduction op, called as `segment_reduce_fn(data=..., segment_ids=..., num_segments=...)`.
+        name (:obj:`str`):
+            Name for the operation. Currently not used
+
+    Returns:
+        (:obj:`tuple`): `(output_values, output_index)` - the reduced values and a range index map over the segments.
+    """
+    # Segment ops do not support batching, so the batch dimensions are folded
+    # into one flat axis. Extra dimensions to the right of the index are kept
+    # as they are because segmented ops support vector-valued data.
+    flattened_index = flatten(index)
+    trailing_shape = tf.shape(values)[index.indices.shape.rank :]
+    flattened_values = tf.reshape(values, tf.concat([[-1], trailing_shape], axis=0))
+    reduced = segment_reduce_fn(
+        data=flattened_values, segment_ids=flattened_index.indices, num_segments=flattened_index.num_segments
+    )
+
+    # Restore the batch dimensions on the reduced values.
+    batch_shape = index.batch_shape()
+    unflattened_shape = tf.concat([batch_shape, [index.num_segments], trailing_shape], axis=0)
+    output_values = tf.reshape(reduced, unflattened_shape)
+    output_index = range_index_map(batch_shape, index.num_segments)
+    return output_values, output_index
+
+
+def reduce_mean(values, index, name="segmented_reduce_mean"):
+    """
+    Averages a tensor over its segments. Outputs 0 for empty segments. This operation computes the mean over segments,
+    with support for:
+
+        - Batching using the first dimensions [B1, B2, ..., Bn]. Each element in a batch can have different indices.
+        - Vectorization using the last dimension [V1, V2, ...]. If they are present the output will be a mean of vectors
+          rather than scalars.
+    Only the middle dimensions [I1, ..., Ik] are reduced by the operation.
+
+    Args:
+        values: [B1, B2, ..., Bn, I1, .., Ik, V1, V2, ..] tensor of values to be
+          averaged.
+        index: IndexMap [B1, B2, ..., Bn, I1, .., Ik] index defining the segments.
+        name: Name for the TensorFlow ops; forwarded to `_segment_reduce`, which currently does not use it.
+
+    Returns:
+        A pair (output_values, output_index) where `output_values` is a tensor of shape [B1, B2, ..., Bn, num_segments,
+        V1, V2, ..] and `output_index` is an IndexMap with shape [B1, B2, ..., Bn, num_segments].
+    """
+    return _segment_reduce(values, index, tf.math.unsorted_segment_mean, name)
+
+
+def reduce_sum(values, index, name="segmented_reduce_sum"):
+    """
+    Sums a tensor over its segments. Outputs 0 for empty segments. This operation computes the sum over segments, with
+    support for:
+
+        - Batching using the first dimensions [B1, B2, ..., Bn]. Each element in a batch can have different indices.
+        - Vectorization using the last dimension [V1, V2, ...]. If they are present the output will be a sum of vectors
+          rather than scalars.
+    Only the middle dimensions [I1, ..., Ik] are reduced by the operation.
+
+    Args:
+        values: [B1, B2, ..., Bn, I1, .., Ik, V1, V2, ..] tensor of values to be
+          summed.
+        index: IndexMap [B1, B2, ..., Bn, I1, .., Ik] index defining the segments.
+        name: Name for the TensorFlow ops; forwarded to `_segment_reduce`, which currently does not use it.
+
+    Returns:
+        A pair (output_values, output_index) where `output_values` is a tensor of shape [B1, B2, ..., Bn, num_segments,
+        V1, V2, ..] and `output_index` is an IndexMap with shape [B1, B2, ..., Bn, num_segments].
+    """
+    return _segment_reduce(values, index, tf.math.unsorted_segment_sum, name)
+
+
+def reduce_max(values, index, name="segmented_reduce_max"):
+    """
+    Computes the maximum over segments. This operation computes the maximum over segments, with support for:
+
+        - Batching using the first dimensions [B1, B2, ..., Bn]. Each element in a batch can have different indices.
+        - Vectorization using the last dimension [V1, V2, ...]. If they are present the output will be an element-wise
+          maximum of vectors rather than scalars.
+    Only the middle dimensions [I1, ..., Ik] are reduced by the operation.
+
+    Args:
+        values: [B1, B2, ..., Bn, I1, .., Ik, V1, V2, ..] tensor of values to take
+          the maximum of.
+        index: IndexMap [B1, B2, ..., Bn, I1, .., Ik] index defining the segments.
+        name: Name for the TensorFlow ops; forwarded to `_segment_reduce`, which currently does not use it.
+
+    Returns:
+        A pair (output_values, output_index) where `output_values` is a tensor of shape [B1, B2, ..., Bn, num_segments,
+        V1, V2, ..] and `output_index` is an IndexMap with shape [B1, B2, ..., Bn, num_segments].
+    """
+    return _segment_reduce(values, index, tf.math.unsorted_segment_max, name)
+
+
+def reduce_min(values, index, name="segmented_reduce_min"):
+    """Computes the minimum over segments; returns the `(output_values, output_index)` pair of `_segment_reduce`."""
+    return _segment_reduce(values, index, tf.math.unsorted_segment_min, name)
+
+
+def _single_column_cell_selection_loss(token_logits, column_logits, labels, cell_index, col_index, cell_mask):
+    """
+    Computes the loss for cell selection constrained to a single column. The loss is a hierarchical log-likelihood. The
+    model first predicts a column and then selects cells within that column (conditioned on the column). Cells outside
+    the selected column are never selected.
+
+    Args:
+        token_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+            Tensor containing the logits per token.
+        column_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, max_num_cols)`):
+            Tensor containing the logits per column.
+        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`):
+            Labels per token.
+        cell_index (:obj:`ProductIndexMap`):
+            Index that groups tokens into cells.
+        col_index (:obj:`IndexMap`):
+            Index that groups tokens into columns.
+        cell_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, max_num_rows * max_num_cols)`):
+            Mask for cells that exist in the table (i.e. that are not padding).
+
+    Returns:
+        selection_loss_per_example (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): Loss for each example.
+        logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): New logits which are only allowed to
+        select cells in a single column. Logits outside of the most likely column according to `column_logits` are
+        pushed to a very low value (such that the probabilities are 0).
+    """
+    # First find the column we should select. We use the column with maximum
+    # number of selected cells.
+    labels_per_column, _ = reduce_sum(tf.cast(labels, tf.float32), col_index)
+    column_label = tf.argmax(labels_per_column, axis=-1, output_type=tf.int32)
+    # Check if there are no selected cells in the column. In that case the model
+    # should predict the special column id 0, which means "select nothing".
+    no_cell_selected = tf.equal(tf.reduce_max(labels_per_column, axis=-1), 0)
+    column_label = tf.where(no_cell_selected, tf.zeros_like(column_label), column_label)
+
+    column_dist = tfp.distributions.Categorical(logits=column_logits)
+    column_loss_per_example = -column_dist.log_prob(column_label)
+
+    # Reduce the labels and logits to per-cell from per-token.
+    logits_per_cell, _ = reduce_mean(token_logits, cell_index)
+    labels_per_cell, labels_index = reduce_max(tf.cast(labels, tf.int32), cell_index)
+
+    # Mask for the selected column.
+    column_id_for_cells = cell_index.project_inner(labels_index).indices
+    column_mask = tf.cast(tf.equal(column_id_for_cells, tf.expand_dims(column_label, axis=1)), tf.float32)
+
+    # Compute the log-likelihood for cells, but only for the selected column.
+    cell_dist = tfp.distributions.Bernoulli(logits=logits_per_cell)
+    cell_log_prob = cell_dist.log_prob(labels_per_cell)
+    cell_loss = -tf.reduce_sum(cell_log_prob * column_mask * cell_mask, axis=1)
+    # We need to normalize the loss by the number of cells in the column.
+    cell_loss /= tf.reduce_sum(column_mask * cell_mask, axis=1) + EPSILON_ZERO_DIVISION
+
+    selection_loss_per_example = column_loss_per_example
+    selection_loss_per_example += tf.where(no_cell_selected, tf.zeros_like(selection_loss_per_example), cell_loss)
+
+    # Set the probs outside the selected column (selected by the *model*)
+    # to 0. This ensures backwards compatibility with models that select
+    # cells from multiple columns.
+    selected_column_id = tf.argmax(column_logits, axis=-1, output_type=tf.int32)
+    selected_column_mask = tf.cast(
+        tf.equal(column_id_for_cells, tf.expand_dims(selected_column_id, axis=-1)), tf.float32
+    )
+    # Never select cells with the special column id 0.
+    selected_column_mask = tf.where(
+        tf.equal(column_id_for_cells, 0), tf.zeros_like(selected_column_mask), selected_column_mask
+    )
+    logits_per_cell += CLOSE_ENOUGH_TO_LOG_ZERO * (1.0 - cell_mask * selected_column_mask)
+    logits = gather(logits_per_cell, cell_index)
+
+    return selection_loss_per_example, logits
+
+
+def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference, labels, aggregation_classifier):
+    """
+    Finds examples where the model should select cells with no aggregation.
+
+    Returns a mask that determines for which examples the model should select answers directly from the table, without
+    any aggregation function. If the answer is a piece of text the case is unambiguous as aggregation functions only
+    apply to numbers. If the answer is a number but does not appear in the table then we must use some aggregation
+    case. The ambiguous case is when the answer is a number that also appears in the table. In this case we use the
+    aggregation function probabilities predicted by the model to decide whether to select or aggregate. The threshold
+    for this is a hyperparameter `cell_selection_preference`
+
+    Args:
+        answer (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`):
+            Answer for every example in the batch. Nan if there is no scalar answer.
+        pooled_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`):
+            Output of the pooler (BertPooler) on top of the encoder layer.
+        cell_selection_preference (:obj:`float`):
+            Preference for cell selection in ambiguous cases.
+        labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Labels per token.
+        aggregation_classifier (callable): Aggregation head; called on `pooled_output` to get the aggregation logits.
+
+    Returns:
+        aggregate_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): A mask set to 1 for examples that should use
+        aggregation functions.
+    """
+    # 1.0 where a scalar answer is available (answer is not NaN), shape (batch_size,)
+    aggregate_mask_init = tf.cast(tf.logical_not(tf.math.is_nan(answer)), tf.float32)
+    logits_aggregation = aggregation_classifier(pooled_output)
+    dist_aggregation = tfp.distributions.Categorical(logits=logits_aggregation)
+    # Index 0 corresponds to "no aggregation".
+    aggregation_ops_total_mass = tf.reduce_sum(dist_aggregation.probs_parameter()[:, 1:], axis=1)
+    # Cell selection examples according to current model.
+    is_pred_cell_selection = aggregation_ops_total_mass <= cell_selection_preference
+    # Examples with non-empty cell selection supervision.
+    is_cell_supervision_available = tf.reduce_sum(labels, axis=1) > 0
+    aggregate_mask = tf.where(
+        tf.logical_and(is_pred_cell_selection, is_cell_supervision_available),
+        tf.zeros_like(aggregate_mask_init, dtype=tf.float32),
+        aggregate_mask_init,
+    )
+    aggregate_mask = tf.stop_gradient(aggregate_mask)
+    return aggregate_mask
+
+
+def _calculate_aggregation_loss_known(
+    logits_aggregation, aggregate_mask, aggregation_labels, use_answer_as_supervision, num_aggregation_labels
+):
+    """
+    Calculates aggregation loss when its type is known during training.
+
+    In the weakly supervised setting, the only known information is that for cell selection examples, "no aggregation"
+    should be predicted. For other examples (those that require aggregation), no loss is accumulated. In the setting
+    where aggregation type is always known, standard cross entropy loss is accumulated for all examples
+
+    Args:
+        logits_aggregation (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_aggregation_labels)`):
+            Logits per aggregation operation.
+        aggregate_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`):
+            A mask set to 1 for examples that should use aggregation functions.
+        aggregation_labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`):
+            Aggregation function id for every example in the batch.
+        use_answer_as_supervision (:obj:`bool`):
+            Whether to use the answer as the only supervision for aggregation examples.
+        num_aggregation_labels (:obj:`int`):
+            The number of aggregation operators to predict.
+
+    Returns:
+        aggregation_loss_known (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): Aggregation loss (when its type is
+        known during training) per example.
+    """
+    # With answer supervision the only known target is "no aggregation" (id 0)
+    # for cell selection examples; otherwise the aggregation supervision is
+    # used as the target.
+    targets = tf.zeros_like(aggregate_mask, dtype=tf.int32) if use_answer_as_supervision else aggregation_labels
+
+    targets_one_hot = tf.one_hot(targets, depth=num_aggregation_labels, dtype=tf.float32)
+    log_probabilities = tf.nn.log_softmax(logits_aggregation, axis=-1)
+
+    # Cross entropy per example, shape [batch_size].
+    cross_entropy = -tf.reduce_sum(targets_one_hot * log_probabilities, axis=-1)
+
+    if not use_answer_as_supervision:
+        return cross_entropy
+
+    # Accumulate loss only for examples requiring cell selection
+    # (no aggregation).
+    cell_selection_examples = 1 - aggregate_mask
+    return cross_entropy * cell_selection_examples
+
+
+def _calculate_aggregation_loss_unknown(logits_aggregation, aggregate_mask):
+    """
+    Calculates aggregation loss in the case of answer supervision.
+
+    Args:
+        logits_aggregation (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_aggregation_labels)`):
+            Logits per aggregation operation.
+        aggregate_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`):
+            A mask set to 1 for examples that should use aggregation functions
+
+    Returns:
+        aggregation_loss_unknown (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): Negative log of the total
+        probability mass on aggregation ops (ids >= 1), zeroed where `aggregate_mask` is 0.
+    """
+    dist_aggregation = tfp.distributions.Categorical(logits=logits_aggregation)
+    # Index 0 corresponds to "no aggregation".
+    aggregation_ops_total_mass = tf.reduce_sum(dist_aggregation.probs_parameter()[:, 1:], axis=1)
+    # Predict some aggregation in case of an answer that needs aggregation.
+    # This increases the probability of all aggregation functions, in a way
+    # similar to MML, but without considering whether the function gives the
+    # correct answer.
+    return -tf.math.log(aggregation_ops_total_mass) * aggregate_mask
+
+
+def _calculate_aggregation_loss(
+    logits_aggregation,
+    aggregate_mask,
+    aggregation_labels,
+    use_answer_as_supervision,
+    num_aggregation_labels,
+    aggregation_loss_weight,
+):
+    """
+    Calculates the aggregation loss per example.
+
+    Args:
+        logits_aggregation (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_aggregation_labels)`):
+            Logits per aggregation operation.
+        aggregate_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`):
+            A mask set to 1 for examples that should use aggregation functions.
+        aggregation_labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`):
+            Aggregation function id for every example in the batch.
+        use_answer_as_supervision (:obj:`bool`):
+            Whether to use the answer as the only supervision for aggregation examples.
+        num_aggregation_labels (:obj:`int`):
+            The number of aggregation operators to predict.
+        aggregation_loss_weight (:obj:`float`):
+            Importance weight for the aggregation loss.
+
+    Returns:
+        aggregation_loss (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): Aggregation loss per example.
+    """
+    per_example_aggregation_loss = _calculate_aggregation_loss_known(
+        logits_aggregation, aggregate_mask, aggregation_labels, use_answer_as_supervision, num_aggregation_labels
+    )
+
+    if use_answer_as_supervision:
+        # Add aggregation loss for numeric answers that need aggregation.
+        per_example_aggregation_loss += _calculate_aggregation_loss_unknown(logits_aggregation, aggregate_mask)
+    return aggregation_loss_weight * per_example_aggregation_loss
+
+
+def _calculate_expected_result(
+    dist_per_cell, numeric_values, numeric_values_scale, input_mask_float, logits_aggregation, config
+):
+    """
+    Calculates the expected result given cell and aggregation probabilities.
+
+    Args:
+        dist_per_cell (:obj:`tfp.distributions.Bernoulli`):
+            Cell selection distribution for each cell.
+        numeric_values (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`):
+            Numeric values of every token. Nan for tokens which are not numeric values.
+        numeric_values_scale (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`):
+            Scale of the numeric values of every token.
+        input_mask_float (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`):
+            Mask for the table, without question tokens and table headers.
+        logits_aggregation (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_aggregation_labels)`):
+            Logits per aggregation operation.
+        config (:class:`~transformers.TapasConfig`):
+            Model configuration class with all the hyperparameters of the model
+
+    Returns:
+        expected_result (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): The expected result per example.
+    """
+    if config.use_gumbel_for_cells:
+        gumbel_dist = tfp.distributions.RelaxedBernoulli(
+            # The token logits were already divided by the temperature and used for
+            # computing cell selection errors so we need to multiply it again here
+            config.temperature,
+            logits=dist_per_cell.logits_parameter() * config.temperature,
+        )
+        scaled_probability_per_cell = gumbel_dist.sample()
+    else:
+        scaled_probability_per_cell = dist_per_cell.probs_parameter()
+
+    # [batch_size, seq_length]
+    scaled_probability_per_cell = (scaled_probability_per_cell / numeric_values_scale) * input_mask_float
+    count_result = tf.reduce_sum(scaled_probability_per_cell, axis=1)
+    numeric_values_masked = tf.where(
+        tf.math.is_nan(numeric_values), tf.zeros_like(numeric_values), numeric_values
+    )  # Mask non-numeric table values to zero.
+    sum_result = tf.reduce_sum(scaled_probability_per_cell * numeric_values_masked, axis=1)
+    avg_approximation = config.average_approximation_function
+    if avg_approximation == AverageApproximationFunction.RATIO:
+        average_result = sum_result / (count_result + EPSILON_ZERO_DIVISION)
+    elif avg_approximation == AverageApproximationFunction.FIRST_ORDER:
+        # 1 + the sum of the probabilities of all the other cells
+        ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1
+        average_result = tf.reduce_sum(numeric_values_masked * scaled_probability_per_cell / ex, axis=1)
+    elif avg_approximation == AverageApproximationFunction.SECOND_ORDER:
+        # 1 + the sum of the probabilities of all the other cells
+        ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1
+        pointwise_var = scaled_probability_per_cell * (1 - scaled_probability_per_cell)
+        var = tf.reduce_sum(pointwise_var, axis=1, keepdims=True) - pointwise_var
+        multiplier = (var / tf.math.square(ex) + 1) / ex
+        average_result = tf.reduce_sum(numeric_values_masked * scaled_probability_per_cell * multiplier, axis=1)
+    else:
+        raise ValueError(f"Invalid average_approximation_function: {config.average_approximation_function}")
+
+    if config.use_gumbel_for_aggregation:
+        gumbel_dist = tfp.distributions.RelaxedOneHotCategorical(
+            config.aggregation_temperature, logits=logits_aggregation[:, 1:]
+        )
+        # [batch_size, num_aggregation_labels - 1]
+        aggregation_op_only_probs = gumbel_dist.sample()
+    else:
+        # [batch_size, num_aggregation_labels - 1]
+        aggregation_op_only_probs = tf.nn.softmax(logits_aggregation[:, 1:] / config.aggregation_temperature, axis=-1)
+    all_results = tf.concat(
+        [
+            tf.expand_dims(sum_result, axis=1),
+            tf.expand_dims(average_result, axis=1),
+            tf.expand_dims(count_result, axis=1),
+        ],
+        axis=1,
+    )
+    expected_result = tf.reduce_sum(all_results * aggregation_op_only_probs, axis=1)
+    return expected_result
+
+
+def _calculate_regression_loss(
+    answer,
+    aggregate_mask,
+    dist_per_cell,
+    numeric_values,
+    numeric_values_scale,
+    input_mask_float,
+    logits_aggregation,
+    config,
+):
+    """
+    Calculates the regression loss per example.
+
+    Args:
+        answer (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`):
+            Answer for every example in the batch. Nan if there is no scalar answer.
+        aggregate_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`):
+            A mask set to 1 for examples that should use aggregation functions.
+        dist_per_cell (:obj:`tfp.distributions.Bernoulli`):
+            Cell selection distribution for each cell.
+        numeric_values (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`):
+            Numeric values of every token. Nan for tokens which are not numeric values.
+        numeric_values_scale (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`):
+            Scale of the numeric values of every token.
+        input_mask_float (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`):
+            Mask for the table, without question tokens and table headers.
+        logits_aggregation (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_aggregation_labels)`):
+            Logits per aggregation operation.
+        config (:class:`~transformers.TapasConfig`):
+            Model configuration class with all the parameters of the model
+
+    Returns:
+        per_example_answer_loss_scaled (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): Scaled answer loss.
+        large_answer_loss_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): A mask which is 0 for examples whose
+        answer loss is larger than `config.answer_loss_cutoff` and 1 otherwise (all ones when no cutoff is set).
+    """
+    # float32 (batch_size,)
+    expected_result = _calculate_expected_result(
+        dist_per_cell, numeric_values, numeric_values_scale, input_mask_float, logits_aggregation, config
+    )
+
+    # [batch_size]; NaN answers (no scalar answer) are replaced by 0
+    answer_masked = tf.where(tf.math.is_nan(answer), tf.zeros_like(answer), answer)
+
+    if config.use_normalized_answer_loss:
+        normalizer = tf.stop_gradient(
+            tf.math.maximum(tf.math.abs(expected_result), tf.math.abs(answer_masked)) + EPSILON_ZERO_DIVISION
+        )
+        normalized_answer_masked = answer_masked / normalizer
+        normalized_expected_result = expected_result / normalizer
+        per_example_answer_loss = tf.compat.v1.losses.huber_loss(
+            normalized_answer_masked * aggregate_mask,
+            normalized_expected_result * aggregate_mask,
+            delta=tf.cast(1.0, tf.float32),
+            reduction=tf.losses.Reduction.NONE,
+        )
+    else:
+        per_example_answer_loss = tf.compat.v1.losses.huber_loss(
+            answer_masked * aggregate_mask,
+            expected_result * aggregate_mask,
+            delta=tf.cast(config.huber_loss_delta, tf.float32),
+            reduction=tf.losses.Reduction.NONE,
+        )
+    if config.answer_loss_cutoff is None:
+        large_answer_loss_mask = tf.ones_like(per_example_answer_loss, dtype=tf.float32)
+    else:
+        large_answer_loss_mask = tf.where(
+            per_example_answer_loss > config.answer_loss_cutoff,
+            tf.zeros_like(per_example_answer_loss, dtype=tf.float32),
+            tf.ones_like(per_example_answer_loss, dtype=tf.float32),
+        )
+    per_example_answer_loss_scaled = config.answer_loss_importance * (per_example_answer_loss * aggregate_mask)
+    return per_example_answer_loss_scaled, large_answer_loss_mask
diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py
index 3bd7a00d35..341d32ee19 100644
--- a/src/transformers/models/tapas/tokenization_tapas.py
+++ b/src/transformers/models/tapas/tokenization_tapas.py
@@ -1897,9 +1897,9 @@ class TapasTokenizer(PreTrainedTokenizer):
data (:obj:`dict`):
Dictionary mapping features to actual values. Should be created using
:class:`~transformers.TapasTokenizer`.
- logits (:obj:`np.ndarray` of shape ``(batch_size, sequence_length)``):
+ logits (:obj:`torch.Tensor` or :obj:`tf.Tensor` of shape ``(batch_size, sequence_length)``):
Tensor containing the logits at the token level.
- logits_agg (:obj:`np.ndarray` of shape ``(batch_size, num_aggregation_labels)``, `optional`):
+ logits_agg (:obj:`torch.Tensor` or :obj:`tf.Tensor` of shape ``(batch_size, num_aggregation_labels)``, `optional`):
Tensor containing the aggregation logits.
cell_classification_threshold (:obj:`float`, `optional`, defaults to 0.5):
Threshold to be used for cell selection. All table cells for which their probability is larger than
@@ -1915,6 +1915,11 @@ class TapasTokenizer(PreTrainedTokenizer):
- predicted_aggregation_indices (``List[int]``of length ``batch_size``, `optional`, returned when
``logits_aggregation`` is provided): Predicted aggregation operator indices of the aggregation head.
"""
+ # converting to numpy arrays to work with PT/TF
+ logits = logits.numpy()
+ if logits_agg is not None:
+ logits_agg = logits_agg.numpy()
+ data = {key: value.numpy() for key, value in data.items() if key != "training"}
# input data is of type float32
# np.log(np.finfo(np.float32).max) = 88.72284
# Any value over 88.72284 will overflow when passed through the exponential, sending a warning
@@ -1975,7 +1980,7 @@ class TapasTokenizer(PreTrainedTokenizer):
output = (predicted_answer_coordinates,)
if logits_agg is not None:
- predicted_aggregation_indices = logits_agg.argmax(dim=-1)
+ predicted_aggregation_indices = logits_agg.argmax(axis=-1)
output = (predicted_answer_coordinates, predicted_aggregation_indices.tolist())
return output
diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py
index afbd41e615..90c718780d 100755
--- a/src/transformers/pipelines/__init__.py
+++ b/src/transformers/pipelines/__init__.py
@@ -78,6 +78,7 @@ if is_tf_available():
TFAutoModelForQuestionAnswering,
TFAutoModelForSeq2SeqLM,
TFAutoModelForSequenceClassification,
+ TFAutoModelForTableQuestionAnswering,
TFAutoModelForTokenClassification,
)
@@ -170,7 +171,7 @@ SUPPORTED_TASKS = {
"table-question-answering": {
"impl": TableQuestionAnsweringPipeline,
"pt": (AutoModelForTableQuestionAnswering,) if is_torch_available() else (),
- "tf": (),
+ "tf": (TFAutoModelForTableQuestionAnswering,) if is_tf_available() else (),
"default": {
"model": {
"pt": "google/tapas-base-finetuned-wtq",
diff --git a/src/transformers/pipelines/table_question_answering.py b/src/transformers/pipelines/table_question_answering.py
index 7697752b2b..1ec93d1160 100644
--- a/src/transformers/pipelines/table_question_answering.py
+++ b/src/transformers/pipelines/table_question_answering.py
@@ -2,7 +2,13 @@ import collections
import numpy as np
-from ..file_utils import add_end_docstrings, is_torch_available, requires_backends
+from ..file_utils import (
+ add_end_docstrings,
+ is_tensorflow_probability_available,
+ is_tf_available,
+ is_torch_available,
+ requires_backends,
+)
from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline, PipelineException
@@ -11,6 +17,13 @@ if is_torch_available():
from ..models.auto.modeling_auto import MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
+if is_tf_available() and is_tensorflow_probability_available():
+ import tensorflow as tf
+
+ import tensorflow_probability as tfp
+
+ from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
+
class TableQuestionAnsweringArgumentHandler(ArgumentHandler):
"""
@@ -83,10 +96,11 @@ class TableQuestionAnsweringPipeline(Pipeline):
super().__init__(*args, **kwargs)
self._args_parser = args_parser
- if self.framework == "tf":
- raise ValueError("The TableQuestionAnsweringPipeline is only available in PyTorch.")
-
- self.check_model_type(MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING)
+ self.check_model_type(
+ TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
+ if self.framework == "tf"
+ else MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
+ )
self.aggregate = bool(getattr(self.model.config, "aggregation_labels")) and bool(
getattr(self.model.config, "num_aggregation_labels")
@@ -100,67 +114,129 @@ class TableQuestionAnsweringPipeline(Pipeline):
Inference used for models that need to process sequences in a sequential fashion, like the SQA models which
handle conversational query related to a table.
"""
- all_logits = []
- all_aggregations = []
- prev_answers = None
- batch_size = inputs["input_ids"].shape[0]
+ if self.framework == "pt":
+ all_logits = []
+ all_aggregations = []
+ prev_answers = None
+ batch_size = inputs["input_ids"].shape[0]
- input_ids = inputs["input_ids"].to(self.device)
- attention_mask = inputs["attention_mask"].to(self.device)
- token_type_ids = inputs["token_type_ids"].to(self.device)
- token_type_ids_example = None
+ input_ids = inputs["input_ids"].to(self.device)
+ attention_mask = inputs["attention_mask"].to(self.device)
+ token_type_ids = inputs["token_type_ids"].to(self.device)
+ token_type_ids_example = None
- for index in range(batch_size):
- # If sequences have already been processed, the token type IDs will be created according to the previous
- # answer.
- if prev_answers is not None:
- prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,)
- model_labels = np.zeros_like(prev_labels_example.cpu().numpy()) # shape (seq_len,)
+ for index in range(batch_size):
+ # If sequences have already been processed, the token type IDs will be created according to the previous
+ # answer.
+ if prev_answers is not None:
+ prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,)
+ model_labels = np.zeros_like(prev_labels_example.cpu().numpy()) # shape (seq_len,)
+ token_type_ids_example = token_type_ids[index] # shape (seq_len, 7)
+ for i in range(model_labels.shape[0]):
+ segment_id = token_type_ids_example[:, 0].tolist()[i]
+ col_id = token_type_ids_example[:, 1].tolist()[i] - 1
+ row_id = token_type_ids_example[:, 2].tolist()[i] - 1
+
+ if row_id >= 0 and col_id >= 0 and segment_id == 1:
+ model_labels[i] = int(prev_answers[(col_id, row_id)])
+
+ token_type_ids_example[:, 3] = torch.from_numpy(model_labels).type(torch.long).to(self.device)
+
+ input_ids_example = input_ids[index]
+ attention_mask_example = attention_mask[index] # shape (seq_len,)
token_type_ids_example = token_type_ids[index] # shape (seq_len, 7)
- for i in range(model_labels.shape[0]):
+ outputs = self.model(
+ input_ids=input_ids_example.unsqueeze(0),
+ attention_mask=attention_mask_example.unsqueeze(0),
+ token_type_ids=token_type_ids_example.unsqueeze(0),
+ )
+ logits = outputs.logits
+
+ if self.aggregate:
+ all_aggregations.append(outputs.logits_aggregation)
+
+ all_logits.append(logits)
+
+ dist_per_token = torch.distributions.Bernoulli(logits=logits)
+ probabilities = dist_per_token.probs * attention_mask_example.type(torch.float32).to(
+ dist_per_token.probs.device
+ )
+
+ coords_to_probs = collections.defaultdict(list)
+ for i, p in enumerate(probabilities.squeeze().tolist()):
segment_id = token_type_ids_example[:, 0].tolist()[i]
- col_id = token_type_ids_example[:, 1].tolist()[i] - 1
- row_id = token_type_ids_example[:, 2].tolist()[i] - 1
+ col = token_type_ids_example[:, 1].tolist()[i] - 1
+ row = token_type_ids_example[:, 2].tolist()[i] - 1
+ if col >= 0 and row >= 0 and segment_id == 1:
+ coords_to_probs[(col, row)].append(p)
- if row_id >= 0 and col_id >= 0 and segment_id == 1:
- model_labels[i] = int(prev_answers[(col_id, row_id)])
+ prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs}
- token_type_ids_example[:, 3] = torch.from_numpy(model_labels).type(torch.long).to(self.device)
+ logits_batch = torch.cat(tuple(all_logits), 0)
- input_ids_example = input_ids[index]
- attention_mask_example = attention_mask[index] # shape (seq_len,)
- token_type_ids_example = token_type_ids[index] # shape (seq_len, 7)
- outputs = self.model(
- input_ids=input_ids_example.unsqueeze(0),
- attention_mask=attention_mask_example.unsqueeze(0),
- token_type_ids=token_type_ids_example.unsqueeze(0),
- )
- logits = outputs.logits
+ return (logits_batch,) if not self.aggregate else (logits_batch, torch.cat(tuple(all_aggregations), 0))
+ else:
+ all_logits = []
+ all_aggregations = []
+ prev_answers = None
+ batch_size = inputs["input_ids"].shape[0]
- if self.aggregate:
- all_aggregations.append(outputs.logits_aggregation)
+ input_ids = inputs["input_ids"]
+ attention_mask = inputs["attention_mask"]
+ token_type_ids = inputs["token_type_ids"].numpy()
+ token_type_ids_example = None
- all_logits.append(logits)
+ for index in range(batch_size):
+ # If sequences have already been processed, the token type IDs will be created according to the previous
+ # answer.
+ if prev_answers is not None:
+ prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,)
+ model_labels = np.zeros_like(prev_labels_example, dtype=np.int32) # shape (seq_len,)
- dist_per_token = torch.distributions.Bernoulli(logits=logits)
- probabilities = dist_per_token.probs * attention_mask_example.type(torch.float32).to(
- dist_per_token.probs.device
- )
+ token_type_ids_example = token_type_ids[index] # shape (seq_len, 7)
+ for i in range(model_labels.shape[0]):
+ segment_id = token_type_ids_example[:, 0].tolist()[i]
+ col_id = token_type_ids_example[:, 1].tolist()[i] - 1
+ row_id = token_type_ids_example[:, 2].tolist()[i] - 1
- coords_to_probs = collections.defaultdict(list)
- for i, p in enumerate(probabilities.squeeze().tolist()):
- segment_id = token_type_ids_example[:, 0].tolist()[i]
- col = token_type_ids_example[:, 1].tolist()[i] - 1
- row = token_type_ids_example[:, 2].tolist()[i] - 1
- if col >= 0 and row >= 0 and segment_id == 1:
- coords_to_probs[(col, row)].append(p)
+ if row_id >= 0 and col_id >= 0 and segment_id == 1:
+ model_labels[i] = int(prev_answers[(col_id, row_id)])
- prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs}
+ token_type_ids_example[:, 3] = model_labels
- logits_batch = torch.cat(tuple(all_logits), 0)
+ input_ids_example = input_ids[index]
+ attention_mask_example = attention_mask[index] # shape (seq_len,)
+ token_type_ids_example = token_type_ids[index] # shape (seq_len, 7)
+ outputs = self.model(
+ input_ids=np.expand_dims(input_ids_example, axis=0),
+ attention_mask=np.expand_dims(attention_mask_example, axis=0),
+ token_type_ids=np.expand_dims(token_type_ids_example, axis=0),
+ )
+ logits = outputs.logits
- return (logits_batch,) if not self.aggregate else (logits_batch, torch.cat(tuple(all_aggregations), 0))
+ if self.aggregate:
+ all_aggregations.append(outputs.logits_aggregation)
+
+ all_logits.append(logits)
+
+ dist_per_token = tfp.distributions.Bernoulli(logits=logits)
+ probabilities = dist_per_token.probs_parameter() * tf.cast(attention_mask_example, tf.float32)
+
+ coords_to_probs = collections.defaultdict(list)
+ token_type_ids_example = token_type_ids_example
+ for i, p in enumerate(tf.squeeze(probabilities).numpy().tolist()):
+ segment_id = token_type_ids_example[:, 0].tolist()[i]
+ col = token_type_ids_example[:, 1].tolist()[i] - 1
+ row = token_type_ids_example[:, 2].tolist()[i] - 1
+ if col >= 0 and row >= 0 and segment_id == 1:
+ coords_to_probs[(col, row)].append(p)
+
+ prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs}
+
+ logits_batch = tf.concat(tuple(all_logits), 0)
+
+ return (logits_batch,) if not self.aggregate else (logits_batch, tf.concat(tuple(all_aggregations), 0))
def __call__(self, *args, **kwargs):
r"""
@@ -274,7 +350,7 @@ class TableQuestionAnsweringPipeline(Pipeline):
outputs = model_outputs["outputs"]
if self.aggregate:
logits, logits_agg = outputs[:2]
- predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits.detach(), logits_agg)
+ predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits, logits_agg)
answer_coordinates_batch, agg_predictions = predictions
aggregators = {i: self.model.config.aggregation_labels[pred] for i, pred in enumerate(agg_predictions)}
@@ -284,7 +360,7 @@ class TableQuestionAnsweringPipeline(Pipeline):
}
else:
logits = outputs[0]
- predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits.detach())
+ predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits)
answer_coordinates_batch = predictions[0]
aggregators = {}
aggregators_prefix = {}
diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py
index bbfdaedfb7..eebbe737e4 100644
--- a/src/transformers/testing_utils.py
+++ b/src/transformers/testing_utils.py
@@ -44,6 +44,7 @@ from .file_utils import (
is_scatter_available,
is_sentencepiece_available,
is_soundfile_availble,
+ is_tensorflow_probability_available,
is_tf_available,
is_timm_available,
is_tokenizers_available,
@@ -292,6 +293,19 @@ def require_torch_scatter(test_case):
return test_case
+def require_tensorflow_probability(test_case):
+    """
+    Decorator marking a test that requires TensorFlow Probability.
+
+    The test is returned unchanged when `is_tensorflow_probability_available()` is true; otherwise it is wrapped in
+    `unittest.skip` so that it is reported as skipped.
+    """
+    if is_tensorflow_probability_available():
+        return test_case
+
+    skip_marker = unittest.skip("test requires TensorFlow probability")
+    return skip_marker(test_case)
+
+
def require_torchaudio(test_case):
"""
Decorator marking a test that requires torchaudio. These tests are skipped when torchaudio isn't installed.
diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py
index 14991e8b6a..ca564457a2 100644
--- a/src/transformers/utils/dummy_tf_objects.py
+++ b/src/transformers/utils/dummy_tf_objects.py
@@ -239,6 +239,9 @@ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = None
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = None
+TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = None
+
+
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None
@@ -356,6 +359,18 @@ class TFAutoModelForSequenceClassification:
requires_backends(self, ["tf"])
+class TFAutoModelForTableQuestionAnswering:
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["tf"])
+
+ def call(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+
class TFAutoModelForTokenClassification:
def __init__(self, *args, **kwargs):
requires_backends(self, ["tf"])
@@ -2488,6 +2503,69 @@ class TFT5PreTrainedModel:
requires_backends(self, ["tf"])
+TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST = None
+
+
+class TFTapasForMaskedLM:
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["tf"])
+
+ def call(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+
+class TFTapasForQuestionAnswering:
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["tf"])
+
+ def call(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+
+class TFTapasForSequenceClassification:
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["tf"])
+
+ def call(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+
+class TFTapasModel:
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["tf"])
+
+ def call(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+
+class TFTapasPreTrainedModel:
+ def __init__(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+ @classmethod
+ def from_pretrained(cls, *args, **kwargs):
+ requires_backends(cls, ["tf"])
+
+ def call(self, *args, **kwargs):
+ requires_backends(self, ["tf"])
+
+
TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = None
diff --git a/tests/test_modeling_tf_auto.py b/tests/test_modeling_tf_auto.py
index bad3353cc5..bc3a7a6a87 100644
--- a/tests/test_modeling_tf_auto.py
+++ b/tests/test_modeling_tf_auto.py
@@ -17,8 +17,14 @@ import copy
import tempfile
import unittest
-from transformers import CONFIG_MAPPING, AutoConfig, BertConfig, GPT2Config, T5Config, is_tf_available
-from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, require_tf, slow
+from transformers import CONFIG_MAPPING, AutoConfig, BertConfig, GPT2Config, T5Config, TapasConfig, is_tf_available
+from transformers.testing_utils import (
+ DUMMY_UNKNOWN_IDENTIFIER,
+ SMALL_MODEL_IDENTIFIER,
+ require_tensorflow_probability,
+ require_tf,
+ slow,
+)
from .test_modeling_bert import BertModelTester
@@ -32,6 +38,7 @@ if is_tf_available():
TFAutoModelForQuestionAnswering,
TFAutoModelForSeq2SeqLM,
TFAutoModelForSequenceClassification,
+ TFAutoModelForTableQuestionAnswering,
TFAutoModelForTokenClassification,
TFAutoModelWithLMHead,
TFBertForMaskedLM,
@@ -44,6 +51,7 @@ if is_tf_available():
TFGPT2LMHeadModel,
TFRobertaForMaskedLM,
TFT5ForConditionalGeneration,
+ TFTapasForQuestionAnswering,
)
from transformers.models.auto.modeling_tf_auto import (
TF_MODEL_FOR_CAUSAL_LM_MAPPING,
@@ -52,6 +60,7 @@ if is_tf_available():
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+ TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_MAPPING,
TF_MODEL_WITH_LM_HEAD_MAPPING,
@@ -59,6 +68,7 @@ if is_tf_available():
from transformers.models.bert.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers.models.gpt2.modeling_tf_gpt2 import TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST
from transformers.models.t5.modeling_tf_t5 import TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST
+ from transformers.models.tapas.modeling_tf_tapas import TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST
class NewModelConfig(BertConfig):
@@ -176,6 +186,21 @@ class TFAutoModelTest(unittest.TestCase):
self.assertIsNotNone(model)
self.assertIsInstance(model, TFBertForQuestionAnswering)
+ @slow
+ @require_tensorflow_probability
+ def test_table_question_answering_model_from_pretrained(self):
+ # Slicing with [5:6] visits at most one checkpoint name: the archive-list entry at index 5.
+ for model_name in TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST[5:6]:
+ # The checkpoint's configuration must resolve to a TapasConfig through AutoConfig.
+ config = AutoConfig.from_pretrained(model_name)
+ self.assertIsNotNone(config)
+ self.assertIsInstance(config, TapasConfig)
+
+ # First load uses default arguments; its result is overwritten by the second load below.
+ model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_name)
+ # Second load additionally requests the loading info, which is not inspected further here.
+ model, loading_info = TFAutoModelForTableQuestionAnswering.from_pretrained(
+ model_name, output_loading_info=True
+ )
+ # The auto class must have resolved to the TAPAS question-answering model.
+ self.assertIsNotNone(model)
+ self.assertIsInstance(model, TFTapasForQuestionAnswering)
+
def test_from_pretrained_identifier(self):
model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER)
self.assertIsInstance(model, TFBertForMaskedLM)
@@ -210,6 +235,7 @@ class TFAutoModelTest(unittest.TestCase):
TF_MODEL_MAPPING,
TF_MODEL_FOR_PRETRAINING_MAPPING,
TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING,
+ TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
TF_MODEL_WITH_LM_HEAD_MAPPING,
diff --git a/tests/test_modeling_tf_tapas.py b/tests/test_modeling_tf_tapas.py
new file mode 100644
index 0000000000..ca44781973
--- /dev/null
+++ b/tests/test_modeling_tf_tapas.py
@@ -0,0 +1,1036 @@
+# coding=utf-8
+# Copyright 2021 The HuggingFace Inc. team. All rights reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import copy
+import unittest
+
+import numpy as np
+import pandas as pd
+
+from transformers import (
+ TF_MODEL_FOR_CAUSAL_LM_MAPPING,
+ TF_MODEL_FOR_MASKED_LM_MAPPING,
+ TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING,
+ TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING,
+ TF_MODEL_FOR_PRETRAINING_MAPPING,
+ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING,
+ TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING,
+ TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+ TapasConfig,
+ TapasTokenizer,
+ is_tf_available,
+)
+from transformers.file_utils import cached_property
+from transformers.models.auto import get_values
+from transformers.testing_utils import require_tensorflow_probability, require_tf, slow
+
+from .test_configuration_common import ConfigTester
+from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor
+
+
+if is_tf_available():
+ import tensorflow as tf
+
+ from transformers import (
+ TFTapasForMaskedLM,
+ TFTapasForQuestionAnswering,
+ TFTapasForSequenceClassification,
+ TFTapasModel,
+ )
+ from transformers.models.tapas.modeling_tf_tapas import (
+ IndexMap,
+ ProductIndexMap,
+ flatten,
+ gather,
+ range_index_map,
+ reduce_max,
+ reduce_mean,
+ reduce_sum,
+ )
+
+
+class TFTapasModelTester:
+ def __init__(
+ self,
+ parent,
+ batch_size=13,
+ seq_length=7,
+ is_training=True,
+ use_input_mask=True,
+ use_token_type_ids=True,
+ use_labels=True,
+ vocab_size=99,
+ hidden_size=32,
+ num_hidden_layers=5,
+ num_attention_heads=4,
+ intermediate_size=37,
+ hidden_act="gelu",
+ hidden_dropout_prob=0.1,
+ attention_probs_dropout_prob=0.1,
+ initializer_range=0.02,
+ max_position_embeddings=512,
+ type_vocab_sizes=[3, 256, 256, 2, 256, 256, 10],
+ type_sequence_label_size=2,
+ positive_weight=10.0,
+ num_aggregation_labels=4,
+ num_labels=2,
+ aggregation_loss_importance=0.8,
+ use_answer_as_supervision=True,
+ answer_loss_importance=0.001,
+ use_normalized_answer_loss=False,
+ huber_loss_delta=25.0,
+ temperature=1.0,
+ agg_temperature=1.0,
+ use_gumbel_for_cells=False,
+ use_gumbel_for_agg=False,
+ average_approximation_function="ratio",
+ cell_selection_preference=0.5,
+ answer_loss_cutoff=100,
+ max_num_rows=64,
+ max_num_columns=32,
+ average_logits_per_cell=True,
+ select_one_column=True,
+ allow_empty_column_selection=False,
+ init_cell_selection_weights_to_zero=True,
+ reset_position_index_per_cell=True,
+ disable_per_token_loss=False,
+ scope=None,
+ ):
+ self.parent = parent
+ self.batch_size = batch_size
+ self.seq_length = seq_length
+ self.is_training = is_training
+ self.use_input_mask = use_input_mask
+ self.use_token_type_ids = use_token_type_ids
+ self.use_labels = use_labels
+ self.vocab_size = vocab_size
+ self.hidden_size = hidden_size
+ self.num_hidden_layers = num_hidden_layers
+ self.num_attention_heads = num_attention_heads
+ self.intermediate_size = intermediate_size
+ self.hidden_act = hidden_act
+ self.hidden_dropout_prob = hidden_dropout_prob
+ self.attention_probs_dropout_prob = attention_probs_dropout_prob
+ self.initializer_range = initializer_range
+ self.max_position_embeddings = max_position_embeddings
+ self.type_vocab_sizes = type_vocab_sizes
+ self.type_sequence_label_size = type_sequence_label_size
+ self.positive_weight = positive_weight
+ self.num_aggregation_labels = num_aggregation_labels
+ self.num_labels = num_labels
+ self.aggregation_loss_importance = aggregation_loss_importance
+ self.use_answer_as_supervision = use_answer_as_supervision
+ self.answer_loss_importance = answer_loss_importance
+ self.use_normalized_answer_loss = use_normalized_answer_loss
+ self.huber_loss_delta = huber_loss_delta
+ self.temperature = temperature
+ self.agg_temperature = agg_temperature
+ self.use_gumbel_for_cells = use_gumbel_for_cells
+ self.use_gumbel_for_agg = use_gumbel_for_agg
+ self.average_approximation_function = average_approximation_function
+ self.cell_selection_preference = cell_selection_preference
+ self.answer_loss_cutoff = answer_loss_cutoff
+ self.max_num_rows = max_num_rows
+ self.max_num_columns = max_num_columns
+ self.average_logits_per_cell = average_logits_per_cell
+ self.select_one_column = select_one_column
+ self.allow_empty_column_selection = allow_empty_column_selection
+ self.init_cell_selection_weights_to_zero = init_cell_selection_weights_to_zero
+ self.reset_position_index_per_cell = reset_position_index_per_cell
+ self.disable_per_token_loss = disable_per_token_loss
+ self.scope = scope
+
+ def prepare_config_and_inputs(self):
+ input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size)
+
+ input_mask = None
+ if self.use_input_mask:
+ input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+
+ token_type_ids = []
+ for type_vocab_size in self.type_vocab_sizes:
+ token_type_ids.append(ids_tensor(shape=[self.batch_size, self.seq_length], vocab_size=type_vocab_size))
+ token_type_ids = tf.stack(token_type_ids, axis=2)
+
+ sequence_labels = None
+ token_labels = None
+ labels = None
+ numeric_values = None
+ numeric_values_scale = None
+ float_answer = None
+ aggregation_labels = None
+ if self.use_labels:
+ sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size)
+ token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels)
+ labels = ids_tensor([self.batch_size, self.seq_length], vocab_size=2)
+ numeric_values = ids_tensor([self.batch_size, self.seq_length], vocab_size=2, dtype=tf.float32)
+ numeric_values_scale = ids_tensor([self.batch_size, self.seq_length], vocab_size=2, dtype=tf.float32)
+ float_answer = ids_tensor([self.batch_size], vocab_size=2, dtype=tf.float32)
+ aggregation_labels = ids_tensor([self.batch_size], self.num_aggregation_labels)
+
+ config = self.get_config()
+
+ return (
+ config,
+ input_ids,
+ input_mask,
+ token_type_ids,
+ sequence_labels,
+ token_labels,
+ labels,
+ numeric_values,
+ numeric_values_scale,
+ float_answer,
+ aggregation_labels,
+ )
+
+ def get_config(self):
+ return TapasConfig(
+ vocab_size=self.vocab_size,
+ hidden_size=self.hidden_size,
+ num_hidden_layers=self.num_hidden_layers,
+ num_attention_heads=self.num_attention_heads,
+ intermediate_size=self.intermediate_size,
+ hidden_act=self.hidden_act,
+ hidden_dropout_prob=self.hidden_dropout_prob,
+ attention_probs_dropout_prob=self.attention_probs_dropout_prob,
+ max_position_embeddings=self.max_position_embeddings,
+ type_vocab_sizes=self.type_vocab_sizes,
+ initializer_range=self.initializer_range,
+ positive_weight=self.positive_weight,
+ num_aggregation_labels=self.num_aggregation_labels,
+ num_labels=self.num_labels,
+ aggregation_loss_importance=self.aggregation_loss_importance,
+ use_answer_as_supervision=self.use_answer_as_supervision,
+ answer_loss_importance=self.answer_loss_importance,
+ use_normalized_answer_loss=self.use_normalized_answer_loss,
+ huber_loss_delta=self.huber_loss_delta,
+ temperature=self.temperature,
+ agg_temperature=self.agg_temperature,
+ use_gumbel_for_cells=self.use_gumbel_for_cells,
+ use_gumbel_for_agg=self.use_gumbel_for_agg,
+ average_approximation_function=self.average_approximation_function,
+ cell_selection_preference=self.cell_selection_preference,
+ answer_loss_cutoff=self.answer_loss_cutoff,
+ max_num_rows=self.max_num_rows,
+ max_num_columns=self.max_num_columns,
+ average_logits_per_cell=self.average_logits_per_cell,
+ select_one_column=self.select_one_column,
+ allow_empty_column_selection=self.allow_empty_column_selection,
+ init_cell_selection_weights_to_zero=self.init_cell_selection_weights_to_zero,
+ reset_position_index_per_cell=self.reset_position_index_per_cell,
+ disable_per_token_loss=self.disable_per_token_loss,
+ )
+
+ def create_and_check_model(
+ self,
+ config,
+ input_ids,
+ input_mask,
+ token_type_ids,
+ sequence_labels,
+ token_labels,
+ labels,
+ numeric_values,
+ numeric_values_scale,
+ float_answer,
+ aggregation_labels,
+ ):
+ model = TFTapasModel(config=config)
+
+ inputs = {
+ "input_ids": input_ids,
+ "attention_mask": input_mask,
+ "token_type_ids": token_type_ids,
+ }
+ result = model(inputs)
+ inputs.pop("attention_mask")
+ result = model(inputs)
+ inputs.pop("token_type_ids")
+ result = model(inputs)
+
+ self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size))
+ self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size))
+
+ def create_and_check_for_masked_lm(
+ self,
+ config,
+ input_ids,
+ input_mask,
+ token_type_ids,
+ sequence_labels,
+ token_labels,
+ labels,
+ numeric_values,
+ numeric_values_scale,
+ float_answer,
+ aggregation_labels,
+ ):
+ model = TFTapasForMaskedLM(config=config)
+ inputs = {
+ "input_ids": input_ids,
+ "attention_mask": input_mask,
+ "token_type_ids": token_type_ids,
+ "labels": token_labels,
+ }
+ result = model(inputs)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size))
+
+ def create_and_check_for_sequence_classification(
+ self,
+ config,
+ input_ids,
+ input_mask,
+ token_type_ids,
+ sequence_labels,
+ token_labels,
+ labels,
+ numeric_values,
+ numeric_values_scale,
+ float_answer,
+ aggregation_labels,
+ ):
+ config.num_labels = self.num_labels
+ model = TFTapasForSequenceClassification(config=config)
+ inputs = {
+ "input_ids": input_ids,
+ "attention_mask": input_mask,
+ "labels": sequence_labels,
+ }
+ result = model(inputs)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels))
+
+ def create_and_check_for_question_answering(
+ self,
+ config,
+ input_ids,
+ input_mask,
+ token_type_ids,
+ sequence_labels,
+ token_labels,
+ labels,
+ numeric_values,
+ numeric_values_scale,
+ float_answer,
+ aggregation_labels,
+ ):
+ # inference: without aggregation head (SQA). Model only returns logits
+ sqa_config = copy.copy(config)
+ sqa_config.num_aggregation_labels = 0
+ sqa_config.use_answer_as_supervision = False
+ model = TFTapasForQuestionAnswering(config=sqa_config)
+ inputs = {
+ "input_ids": input_ids,
+ "attention_mask": input_mask,
+ "token_type_ids": token_type_ids,
+ }
+
+ result = model(inputs)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
+
+ # inference: with aggregation head (WTQ, WikiSQL-supervised). Model returns logits and aggregation logits
+ model = TFTapasForQuestionAnswering(config=config)
+ inputs = {
+ "input_ids": input_ids,
+ "attention_mask": input_mask,
+ "token_type_ids": token_type_ids,
+ }
+ result = model(inputs)
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels))
+
+ # training: can happen in 3 main ways
+ # case 1: conversational (SQA)
+ model = TFTapasForQuestionAnswering(config=sqa_config)
+ inputs = {
+ "input_ids": input_ids,
+ "attention_mask": input_mask,
+ "token_type_ids": token_type_ids,
+ "labels": labels,
+ }
+ result = model(inputs)
+ self.parent.assertEqual(result.loss.shape, ())
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
+
+ # case 2: weak supervision for aggregation (WTQ)
+ model = TFTapasForQuestionAnswering(config=config)
+ inputs = {
+ "input_ids": input_ids,
+ "attention_mask": input_mask,
+ "token_type_ids": token_type_ids,
+ "labels": labels,
+ "numeric_values": numeric_values,
+ "numeric_values_scale": numeric_values_scale,
+ "float_answer": float_answer,
+ }
+ result = model(inputs)
+ self.parent.assertEqual(result.loss.shape, ())
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels))
+
+ # case 3: strong supervision for aggregation (WikiSQL-supervised)
+ wikisql_config = copy.copy(config)
+ wikisql_config.use_answer_as_supervision = False
+ model = TFTapasForQuestionAnswering(config=wikisql_config)
+ inputs = {
+ "input_ids": input_ids,
+ "attention_mask": input_mask,
+ "token_type_ids": token_type_ids,
+ "labels": labels,
+ "aggregation_labels": aggregation_labels,
+ }
+ result = model(inputs)
+ self.parent.assertEqual(result.loss.shape, ())
+ self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length))
+ self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels))
+
+ def prepare_config_and_inputs_for_common(self):
+     """Return `(config, inputs_dict)` where `inputs_dict` holds input_ids, token_type_ids and attention_mask."""
+     # The tester yields 11 values; the 7 label-related ones are not needed here. Explicit placeholders
+     # (rather than a starred target) keep the unpacking strict about the number of returned values.
+     config, input_ids, input_mask, token_type_ids, _, _, _, _, _, _, _ = self.prepare_config_and_inputs()
+     return config, {
+         "input_ids": input_ids,
+         "token_type_ids": token_type_ids,
+         "attention_mask": input_mask,
+     }
+
+
+@require_tensorflow_probability
+@require_tf
+class TFTapasModelTest(TFModelTesterMixin, unittest.TestCase):
+    """Common `TFModelTesterMixin` checks plus per-head checks for the TF TAPAS classes listed below."""
+
+    # Only populated when `is_tf_available()` is true; otherwise the TF class names are never evaluated.
+    all_model_classes = ()
+    if is_tf_available():
+        all_model_classes = (
+            TFTapasModel,
+            TFTapasForMaskedLM,
+            TFTapasForSequenceClassification,
+            TFTapasForQuestionAnswering,
+        )
+    test_head_masking = False
+    test_onnx = False
+
+    def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict:
+        """Return a deep copy of `inputs_dict` adapted to `model_class`.
+
+        Non-scalar tensors are tiled along a new axis 1 for multiple-choice classes. With
+        `return_labels=True`, constant label tensors matching the head type are added as well.
+        """
+        tester = self.model_tester
+        prepared = copy.deepcopy(inputs_dict)
+        is_multiple_choice = model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING)
+
+        if is_multiple_choice:
+            tiled = {}
+            for name, value in prepared.items():
+                if isinstance(value, tf.Tensor) and value.ndim > 0:
+                    multiples = (1, tester.num_choices) + (1,) * (value.ndim - 1)
+                    value = tf.tile(tf.expand_dims(value, 1), multiples)
+                tiled[name] = value
+            prepared = tiled
+
+        if not return_labels:
+            return prepared
+
+        per_example = tester.batch_size
+        per_token = (tester.batch_size, tester.seq_length)
+        token_level_mappings = (
+            TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING,
+            TF_MODEL_FOR_CAUSAL_LM_MAPPING,
+            TF_MODEL_FOR_MASKED_LM_MAPPING,
+            TF_MODEL_FOR_PRETRAINING_MAPPING,
+            TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING,
+        )
+
+        # Branch order matters: a class is given the labels of the first mapping it belongs to.
+        if is_multiple_choice:
+            prepared["labels"] = tf.ones(per_example, dtype=tf.int32)
+        elif model_class in get_values(TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING):
+            prepared["labels"] = tf.zeros(per_token, dtype=tf.int32)
+            prepared["aggregation_labels"] = tf.zeros(per_example, dtype=tf.int32)
+            prepared["numeric_values"] = tf.zeros(per_token, dtype=tf.float32)
+            prepared["numeric_values_scale"] = tf.zeros(per_token, dtype=tf.float32)
+            prepared["float_answer"] = tf.zeros(per_example, dtype=tf.float32)
+        elif model_class in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING):
+            prepared["labels"] = tf.zeros(per_example, dtype=tf.int32)
+        elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING):
+            prepared["next_sentence_label"] = tf.zeros(per_example, dtype=tf.int32)
+        elif any(model_class in get_values(mapping) for mapping in token_level_mappings):
+            prepared["labels"] = tf.zeros(per_token, dtype=tf.int32)
+        return prepared
+
+    def setUp(self):
+        self.model_tester = TFTapasModelTester(self)
+        self.config_tester = ConfigTester(self, config_class=TapasConfig, hidden_size=37)
+
+    def test_config(self):
+        self.config_tester.run_common_tests()
+
+    def test_model(self):
+        tester = self.model_tester
+        tester.create_and_check_model(*tester.prepare_config_and_inputs())
+
+    def test_for_masked_lm(self):
+        tester = self.model_tester
+        tester.create_and_check_for_masked_lm(*tester.prepare_config_and_inputs())
+
+    def test_for_question_answering(self):
+        tester = self.model_tester
+        tester.create_and_check_for_question_answering(*tester.prepare_config_and_inputs())
+
+    def test_for_sequence_classification(self):
+        tester = self.model_tester
+        tester.create_and_check_for_sequence_classification(*tester.prepare_config_and_inputs())
+
+
+def prepare_tapas_single_inputs_for_inference():
+    """Build one (table, question) pair for the TAPAS inference tests."""
+    question = "Which footballer is 33 years old?"
+    columns = {
+        "Footballer": ["Lionel Messi", "Cristiano Ronaldo"],
+        "Age": ["33", "35"],
+    }
+
+    return pd.DataFrame.from_dict(columns), question
+
+
+def prepare_tapas_batch_inputs_for_inference():
+    """Build one table and a batch of 2 questions about it for the TAPAS inference tests."""
+    questions = ["Which footballer is 33 years old?", "How many goals does Ronaldo have?"]
+    columns = {
+        "Footballer": ["Lionel Messi", "Cristiano Ronaldo"],
+        "Age": ["33", "35"],
+        "Number of goals": ["712", "750"],
+    }
+
+    return pd.DataFrame.from_dict(columns), questions
+
+
+def prepare_tapas_batch_inputs_for_training():
+    """Build a training batch: one table, 2 questions (different from the inference batch) and their answers.
+
+    Returns:
+        `(table, queries, answer_coordinates, answer_text, float_answer)`; the answer lists hold one entry
+        per question.
+    """
+    questions = ["Which footballer is 33 years old?", "What's the total number of goals?"]
+    columns = {
+        "Footballer": ["Lionel Messi", "Cristiano Ronaldo"],
+        "Age": ["33", "35"],
+        "Number of goals": ["712", "750"],
+    }
+    frame = pd.DataFrame.from_dict(columns)
+
+    # Per question: the answer cells as coordinate tuples, the answer as text, and the answer as a float
+    # (NaN for the first question).
+    cell_coordinates = [[(0, 0)], [(0, 2), (1, 2)]]
+    texts = [["Lionel Messi"], ["1462"]]
+    numeric_answers = [float("NaN"), float("1462")]
+
+    return frame, questions, cell_coordinates, texts, numeric_answers
+
+
+@require_tensorflow_probability
+@require_tf
+class TFTapasModelIntegrationTest(unittest.TestCase):
+    """Slow checks comparing TF TAPAS checkpoints against hard-coded reference values."""
+
+    @cached_property
+    def default_tokenizer(self):
+        return TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq")
+
+    def _encode_single_example(self):
+        """Tokenize the single table-question pair with the default tokenizer and return TF tensors."""
+        tokenizer = self.default_tokenizer
+        table, queries = prepare_tapas_single_inputs_for_inference()
+        return tokenizer(table=table, queries=queries, return_tensors="tf")
+
+    @slow
+    def test_inference_no_head(self):
+        # ideally we want to test this with the weights of tapas_inter_masklm_base_reset,
+        # but since it's not straightforward to do this with the TF 1 implementation, we test it with
+        # the weights of the WTQ base model (i.e. tapas_wtq_wikisql_sqa_inter_masklm_base_reset)
+        model = TFTapasModel.from_pretrained("google/tapas-base-finetuned-wtq")
+        outputs = model(**self._encode_single_example())
+
+        # sequence output: top-left 3x3 corner of the hidden states
+        expected_hidden = tf.constant(
+            [
+                [
+                    [-0.141581565, -0.599805772, 0.747186482],
+                    [-0.143664181, -0.602008104, 0.749218345],
+                    [-0.15169853, -0.603363097, 0.741370678],
+                ]
+            ]
+        )
+        tf.debugging.assert_near(outputs.last_hidden_state[:, :3, :3], expected_hidden, atol=0.0005)
+
+        # pooled output: first 3 features
+        expected_pooled = tf.constant([[0.987518311, -0.970520139, -0.994303405]])
+        tf.debugging.assert_near(outputs.pooler_output[:, :3], expected_pooled, atol=0.0005)
+
+    @unittest.skip(reason="Model not available yet")
+    def test_inference_masked_lm(self):
+        pass
+
+    # TapasForQuestionAnswering can be fine-tuned in 3 ways, each of which is tested below:
+    # - conversational set-up (SQA)
+    # - weak supervision for aggregation (WTQ, WikiSQL)
+    # - strong supervision for aggregation (WikiSQL-supervised)
+    @slow
+    def test_inference_question_answering_head_conversational(self):
+        # note that google/tapas-base-finetuned-sqa should correspond to tapas_sqa_inter_masklm_base_reset
+        model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-sqa")
+        outputs = model(**self._encode_single_example())
+
+        logits = outputs.logits
+        tf.debugging.assert_equal(logits.shape, tf.TensorShape([1, 21]))
+
+        # 21 reference logits; runs of identical values are written as repetitions
+        expected_logits = tf.constant(
+            [
+                [-9997.274] * 9
+                + [-16.262585, -10004.089]
+                + [15.435196] * 3
+                + [-9990.443]
+                + [-16.327433] * 5
+                + [-10004.84]
+            ]
+        )
+        tf.debugging.assert_near(logits, expected_logits, atol=0.015)
+
+    @slow
+    def test_inference_question_answering_head_conversational_absolute_embeddings(self):
+        # note that google/tapas-small-finetuned-sqa should correspond to tapas_sqa_inter_masklm_small_reset
+        # however here we test the version with absolute position embeddings
+        model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-small-finetuned-sqa")
+        outputs = model(**self._encode_single_example())
+
+        logits = outputs.logits
+        tf.debugging.assert_equal(logits.shape, tf.TensorShape([1, 21]))
+
+        # 21 reference logits; runs of identical values are written as repetitions
+        expected_logits = tf.constant(
+            [
+                [-10000.041] * 9
+                + [-18.369339, -10014.692]
+                + [17.730324] * 3
+                + [-9984.974]
+                + [-18.322773] * 5
+                + [-10007.267]
+            ]
+        )
+        tf.debugging.assert_near(logits, expected_logits, atol=0.01)
+
+    @slow
+    def test_inference_question_answering_head_weak_supervision(self):
+        # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset
+        model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq")
+
+        tokenizer = self.default_tokenizer
+        # this one runs on a batch of 2 questions
+        table, queries = prepare_tapas_batch_inputs_for_inference()
+        inputs = tokenizer(table=table, queries=queries, padding="longest", return_tensors="tf")
+        outputs = model(**inputs)
+
+        # cell-selection logits: compare the last 6 positions of each example
+        logits = outputs.logits
+        tf.debugging.assert_equal(logits.shape, tf.TensorShape([2, 28]))
+        expected_logits_tail = tf.constant(
+            [
+                [-160.375504, -160.375504, -160.375504, -10072.3965, -10070.9414, -10094.9736],
+                [-9861.6123, -9861.6123, -9861.6123, -9861.6123, -9891.01172, 146.600677],
+            ]
+        )
+        tf.debugging.assert_near(logits[:, -6:], expected_logits_tail, atol=0.4)
+
+        # aggregation logits
+        logits_aggregation = outputs.logits_aggregation
+        tf.debugging.assert_equal(logits_aggregation.shape, tf.TensorShape([2, 4]))
+        expected_aggregation = tf.constant(
+            [[18.8545208, -9.76614857, -6.3128891, -2.93525243], [-4.05782509, 40.0351, -5.35329962, 23.3978653]]
+        )
+        tf.debugging.assert_near(logits_aggregation, expected_aggregation, atol=0.001)
+
+        # predicted answer coordinates and aggregation indices
+        EXPECTED_PREDICTED_ANSWER_COORDINATES = [[(0, 0)], [(1, 2)]]
+        EXPECTED_PREDICTED_AGGREGATION_INDICES = [0, 1]
+
+        predictions = tokenizer.convert_logits_to_predictions(inputs, outputs.logits, outputs.logits_aggregation)
+        predicted_answer_coordinates, predicted_aggregation_indices = predictions
+        tf.debugging.assert_equal(EXPECTED_PREDICTED_ANSWER_COORDINATES, predicted_answer_coordinates)
+        tf.debugging.assert_equal(EXPECTED_PREDICTED_AGGREGATION_INDICES, predicted_aggregation_indices)
+
+    @slow
+    def test_training_question_answering_head_weak_supervision(self):
+        # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset
+        model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq")
+        tokenizer = self.default_tokenizer
+        # this one runs on a batch of 2 questions
+        table, queries, answer_coordinates, answer_text, float_answer = prepare_tapas_batch_inputs_for_training()
+        inputs = tokenizer(
+            table=table,
+            queries=queries,
+            answer_coordinates=answer_coordinates,
+            answer_text=answer_text,
+            padding="longest",
+            return_tensors="tf",
+        )
+        # the float answer is not produced by the tokenizer call above; it is supplied separately
+        float_answer = tf.constant(float_answer, dtype=tf.float32)
+        outputs = model(
+            input_ids=inputs["input_ids"],
+            attention_mask=inputs["attention_mask"],
+            token_type_ids=inputs["token_type_ids"],
+            labels=inputs["labels"],
+            numeric_values=inputs["numeric_values"],
+            numeric_values_scale=inputs["numeric_values_scale"],
+            float_answer=float_answer,
+        )
+
+        # loss
+        tf.debugging.assert_near(outputs.loss, tf.constant(3.3527612686157227e-08), atol=1e-6)
+
+        # cell-selection logits: last 9 positions of the first example
+        logits = outputs.logits
+        tf.debugging.assert_equal(logits.shape, tf.TensorShape([2, 29]))
+        expected_logits_tail = tf.constant([-160.0156] * 5 + [-10072.2266, -10070.8896] + [-10092.6006] * 2)
+        tf.debugging.assert_near(logits[0, -9:], expected_logits_tail, atol=1e-6)
+
+        # aggregation logits of the second example
+        logits_aggregation = outputs.logits_aggregation
+        tf.debugging.assert_equal(logits_aggregation.shape, tf.TensorShape([2, 4]))
+        expected_aggregation = tf.constant([-4.0538, 40.0304, -5.3554, 23.3965])
+        tf.debugging.assert_near(logits_aggregation[1, -4:], expected_aggregation, atol=1e-4)
+
+    @slow
+    def test_inference_question_answering_head_strong_supervision(self):
+        # note that google/tapas-base-finetuned-wikisql-supervised should correspond to tapas_wikisql_sqa_inter_masklm_base_reset
+        model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wikisql-supervised")
+        outputs = model(**self._encode_single_example())
+
+        logits = outputs.logits
+        tf.debugging.assert_equal(logits.shape, tf.TensorShape([1, 21]))
+
+        # 21 reference logits; runs of identical values are written as repetitions
+        expected_logits = tf.constant(
+            [
+                [-10011.1084] * 9
+                + [-18.6185989, -10008.7969]
+                + [17.6355762] * 3
+                + [-10002.4404]
+                + [-18.7111301] * 5
+                + [-10007.0977]
+            ]
+        )
+        tf.debugging.assert_near(logits, expected_logits, atol=0.02)
+
+        # aggregation logits
+        logits_aggregation = outputs.logits_aggregation
+        tf.debugging.assert_equal(logits_aggregation.shape, tf.TensorShape([1, 4]))
+        expected_aggregation = tf.constant([[16.5659733, -3.06624889, -2.34152961, -0.970244825]])
+        tf.debugging.assert_near(logits_aggregation, expected_aggregation, atol=0.003)
+
+    @slow
+    def test_inference_classification_head(self):
+        # note that google/tapas-base-finetuned-tabfact should correspond to tapas_tabfact_inter_masklm_base_reset
+        model = TFTapasForSequenceClassification.from_pretrained("google/tapas-base-finetuned-tabfact")
+        outputs = model(**self._encode_single_example())
+
+        # classification logits
+        logits = outputs.logits
+        tf.debugging.assert_equal(logits.shape, tf.TensorShape([1, 2]))
+        expected_logits = tf.constant([[0.795137286, 9.5572]])
+        tf.debugging.assert_near(logits, expected_logits, atol=0.05)
+
+
+# Below: tests for Tapas utilities which are defined in modeling_tf_tapas.py.
+# These are based on segmented_tensor_test.py of the original implementation.
+# URL: https://github.com/google-research/tapas/blob/master/tapas/models/segmented_tensor_test.py
+@require_tensorflow_probability
+@require_tf
+class TFTapasUtilsTest(unittest.TestCase):
+    """Tests for the TAPAS segmented-tensor utilities (index maps, flatten, gather, segment reductions).
+
+    Results are compared with `np.testing` helpers on `.numpy()` values rather than with TensorFlow's
+    `assertAllEqual` / `assertAllClose`.
+    """
+
+    def _prepare_tables(self):
+        """Prepares two tables, both with three distinct rows.
+        The first table has two columns:
+          1.0, 2.0 | 3.0
+          2.0, 0.0 | 1.0
+          1.0, 3.0 | 4.0
+        The second table has three columns:
+          1.0 | 2.0 | 3.0
+          2.0 | 0.0 | 1.0
+          1.0 | 3.0 | 4.0
+        Returns:
+          SegmentedTensors with the tables.
+        """
+        values = tf.constant(
+            [
+                [[1.0, 2.0, 3.0], [2.0, 0.0, 1.0], [1.0, 3.0, 4.0]],
+                [[1.0, 2.0, 3.0], [2.0, 0.0, 1.0], [1.0, 3.0, 4.0]],
+            ]
+        )
+        row_index = IndexMap(
+            indices=[
+                [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
+                [[0, 0, 0], [1, 1, 1], [2, 2, 2]],
+            ],
+            num_segments=3,
+            batch_dims=1,
+        )
+        col_index = IndexMap(
+            indices=[
+                [[0, 0, 1], [0, 0, 1], [0, 0, 1]],
+                [[0, 1, 2], [0, 1, 2], [0, 1, 2]],
+            ],
+            num_segments=3,
+            batch_dims=1,
+        )
+        return values, row_index, col_index
+
+    def test_product_index(self):
+        """A `ProductIndexMap` of rows and columns has 9 segments and projects back onto both inputs."""
+        _, row_index, col_index = self._prepare_tables()
+        cell_index = ProductIndexMap(row_index, col_index)
+        row_index_proj = cell_index.project_outer(cell_index)
+        col_index_proj = cell_index.project_inner(cell_index)
+
+        ind = cell_index.indices
+        self.assertEqual(cell_index.num_segments, 9)
+
+        # Projections should give back the original indices.
+        np.testing.assert_array_equal(row_index.indices.numpy(), row_index_proj.indices.numpy())
+        self.assertEqual(row_index.num_segments, row_index_proj.num_segments)
+        self.assertEqual(row_index.batch_dims, row_index_proj.batch_dims)
+        np.testing.assert_array_equal(col_index.indices.numpy(), col_index_proj.indices.numpy())
+        self.assertEqual(col_index.batch_dims, col_index_proj.batch_dims)
+
+        # The first and second "column" are identified in the first table.
+        for i in range(3):
+            self.assertEqual(ind[0, i, 0], ind[0, i, 1])
+            self.assertNotEqual(ind[0, i, 0], ind[0, i, 2])
+
+        # Every ordered pair of positions in 0..2. Pairing with `zip(range(3), range(3))` would only yield
+        # (0, 0), (1, 1) and (2, 2), so the guarded assertions below would never execute.
+        position_pairs = [(a, b) for a in range(3) for b in range(3)]
+
+        # All rows are distinct in the first table.
+        for i, i_2 in position_pairs:
+            for j, j_2 in position_pairs:
+                if i != i_2 and j != j_2:
+                    self.assertNotEqual(ind[0, i, j], ind[0, i_2, j_2])
+
+        # All cells are distinct in the second table.
+        for i, i_2 in position_pairs:
+            for j, j_2 in position_pairs:
+                if i != i_2 or j != j_2:
+                    self.assertNotEqual(ind[1, i, j], ind[1, i_2, j_2])
+
+    def test_flatten(self):
+        """`flatten` yields 1-D indices; the second table's ids come out shifted by 3 (see expected values)."""
+        _, row_index, col_index = self._prepare_tables()
+        row_index_flat = flatten(row_index)
+        col_index_flat = flatten(col_index)
+
+        shape = [3, 4, 5]
+        batched_index = IndexMap(indices=tf.zeros(shape, dtype=tf.int32), num_segments=1, batch_dims=3)
+        batched_index_flat = flatten(batched_index)
+
+        np.testing.assert_array_equal(
+            row_index_flat.indices.numpy(), [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5]
+        )
+        np.testing.assert_array_equal(
+            col_index_flat.indices.numpy(), [0, 0, 1, 0, 0, 1, 0, 0, 1, 3, 4, 5, 3, 4, 5, 3, 4, 5]
+        )
+        self.assertEqual(batched_index_flat.num_segments.numpy(), np.prod(shape))
+        np.testing.assert_array_equal(batched_index_flat.indices.numpy(), range(np.prod(shape)))
+
+    def test_range_index_map(self):
+        """`range_index_map` returns indices 0..num_segments-1 for every element of the batch shape."""
+        batch_shape = [3, 4]
+        num_segments = 5
+        index = range_index_map(batch_shape, num_segments)
+
+        self.assertEqual(num_segments, index.num_segments)
+        self.assertEqual(2, index.batch_dims)
+        indices = index.indices
+        np.testing.assert_array_equal(list(indices.shape), [3, 4, 5])
+        for i in range(batch_shape[0]):
+            for j in range(batch_shape[1]):
+                np.testing.assert_array_equal(indices[i, j, :].numpy(), range(num_segments))
+
+    def test_reduce_sum(self):
+        """`reduce_sum` over row, column and cell index maps of the two reference tables."""
+        values, row_index, col_index = self._prepare_tables()
+        cell_index = ProductIndexMap(row_index, col_index)
+        row_sum, _ = reduce_sum(values, row_index)
+        col_sum, _ = reduce_sum(values, col_index)
+        cell_sum, _ = reduce_sum(values, cell_index)
+
+        np.testing.assert_allclose(row_sum.numpy(), [[6.0, 3.0, 8.0], [6.0, 3.0, 8.0]])
+        np.testing.assert_allclose(col_sum.numpy(), [[9.0, 8.0, 0.0], [4.0, 5.0, 8.0]])
+        np.testing.assert_allclose(
+            cell_sum.numpy(),
+            [[3.0, 3.0, 0.0, 2.0, 1.0, 0.0, 4.0, 4.0, 0.0], [1.0, 2.0, 3.0, 2.0, 0.0, 1.0, 1.0, 3.0, 4.0]],
+        )
+
+    def test_reduce_mean(self):
+        """`reduce_mean` over row, column and cell index maps of the two reference tables."""
+        values, row_index, col_index = self._prepare_tables()
+        cell_index = ProductIndexMap(row_index, col_index)
+        row_mean, _ = reduce_mean(values, row_index)
+        col_mean, _ = reduce_mean(values, col_index)
+        cell_mean, _ = reduce_mean(values, cell_index)
+
+        np.testing.assert_allclose(
+            row_mean.numpy(), [[6.0 / 3.0, 3.0 / 3.0, 8.0 / 3.0], [6.0 / 3.0, 3.0 / 3.0, 8.0 / 3.0]]
+        )
+        np.testing.assert_allclose(col_mean.numpy(), [[9.0 / 6.0, 8.0 / 3.0, 0.0], [4.0 / 3.0, 5.0 / 3.0, 8.0 / 3.0]])
+        np.testing.assert_allclose(
+            cell_mean.numpy(),
+            [
+                [3.0 / 2.0, 3.0, 0.0, 2.0 / 2.0, 1.0, 0.0, 4.0 / 2.0, 4.0, 0.0],
+                [1.0, 2.0, 3.0, 2.0, 0.0, 1.0, 1.0, 3.0, 4.0],
+            ],
+        )
+
+    def test_reduce_max(self):
+        """`reduce_max` on a flat vector split into two interleaved segments."""
+        values = tf.convert_to_tensor([2.0, 1.0, 0.0, 3.0])
+        index = IndexMap(indices=tf.convert_to_tensor([0, 1, 0, 1]), num_segments=2)
+        maximum, _ = reduce_max(values, index)
+
+        np.testing.assert_array_equal(maximum.numpy(), [2, 3])
+
+    def test_reduce_sum_vectorized(self):
+        """`reduce_sum` with `batch_dims=0` sums whole rows per segment and returns the reduced index map."""
+        values = tf.convert_to_tensor([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]])
+        index = IndexMap(indices=tf.convert_to_tensor([0, 0, 1]), num_segments=2, batch_dims=0)
+        sums, new_index = reduce_sum(values, index)
+
+        np.testing.assert_allclose(sums.numpy(), [[3.0, 5.0, 7.0], [3.0, 4.0, 5.0]])
+        np.testing.assert_array_equal(new_index.indices.numpy(), [0, 1])
+        np.testing.assert_array_equal(new_index.num_segments.numpy(), 2)
+        np.testing.assert_array_equal(new_index.batch_dims, 0)
+
+    def test_gather(self):
+        """Gathering per-cell sums gives back a tensor shaped like the original table."""
+        values, row_index, col_index = self._prepare_tables()
+        cell_index = ProductIndexMap(row_index, col_index)
+
+        # Compute sums and then gather. The result should have the same shape as
+        # the original table and each element should contain the sum of the values in
+        # its cell.
+        sums, _ = reduce_sum(values, cell_index)
+        cell_sum = gather(sums, cell_index)
+        # A unittest assertion is used instead of a bare `assert`, which `python -O` strips.
+        self.assertEqual(cell_sum.shape, values.shape)
+
+        np.testing.assert_allclose(
+            cell_sum.numpy(),
+            [[[3.0, 3.0, 3.0], [2.0, 2.0, 1.0], [4.0, 4.0, 4.0]], [[1.0, 2.0, 3.0], [2.0, 0.0, 1.0], [1.0, 3.0, 4.0]]],
+        )
+
+    def test_gather_vectorized(self):
+        """`gather` with `batch_dims=1` reorders each batch element's rows according to its own indices."""
+        values = tf.constant([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
+        index = IndexMap(indices=tf.convert_to_tensor([[0, 1], [1, 0]]), num_segments=2, batch_dims=1)
+        result = gather(values, index)
+
+        np.testing.assert_array_equal(result.numpy(), [[[1, 2], [3, 4]], [[7, 8], [5, 6]]])
diff --git a/tests/test_pipelines_table_question_answering.py b/tests/test_pipelines_table_question_answering.py
index a319387ee2..789e92c3d7 100644
--- a/tests/test_pipelines_table_question_answering.py
+++ b/tests/test_pipelines_table_question_answering.py
@@ -19,11 +19,13 @@ from transformers import (
AutoModelForTableQuestionAnswering,
AutoTokenizer,
TableQuestionAnsweringPipeline,
+ TFAutoModelForTableQuestionAnswering,
pipeline,
)
from transformers.testing_utils import (
is_pipeline_test,
require_pandas,
+ require_tensorflow_probability,
require_tf,
require_torch,
require_torch_scatter,
@@ -33,6 +35,7 @@ from transformers.testing_utils import (
from .test_pipelines_common import PipelineTestCaseMeta
+@require_tensorflow_probability
@require_torch_scatter
@require_torch
@require_pandas
@@ -43,9 +46,105 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING
@require_tf
- @unittest.skip("Table question answering not implemented in TF")
def test_small_model_tf(self):
- pass
+ model_id = "lysandre/tiny-tapas-random-wtq"
+ model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ self.assertIsInstance(model.config.aggregation_labels, dict)
+ self.assertIsInstance(model.config.no_aggregation_label_index, int)
+
+ table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer)
+ outputs = table_querier(
+ table={
+ "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
+ "age": ["56", "45", "59"],
+ "number of movies": ["87", "53", "69"],
+ "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
+ },
+ query="how many movies has george clooney played in?",
+ )
+ self.assertEqual(
+ outputs,
+ {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
+ )
+ outputs = table_querier(
+ table={
+ "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
+ "age": ["56", "45", "59"],
+ "number of movies": ["87", "53", "69"],
+ "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
+ },
+ query=["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"],
+ )
+ self.assertEqual(
+ outputs,
+ [
+ {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
+ {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
+ {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
+ ],
+ )
+ outputs = table_querier(
+ table={
+ "Repository": ["Transformers", "Datasets", "Tokenizers"],
+ "Stars": ["36542", "4512", "3934"],
+ "Contributors": ["651", "77", "34"],
+ "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
+ },
+ query=[
+ "What repository has the largest number of stars?",
+ "Given that the numbers of stars defines if a repository is active, what repository is the most active?",
+ "What is the number of repositories?",
+ "What is the average number of stars?",
+ "What is the total amount of stars?",
+ ],
+ )
+ self.assertEqual(
+ outputs,
+ [
+ {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
+ {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
+ {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
+ {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
+ {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"},
+ ],
+ )
+
+ with self.assertRaises(ValueError):
+ table_querier(query="What does it do with empty context ?", table=None)
+ with self.assertRaises(ValueError):
+ table_querier(query="What does it do with empty context ?", table="")
+ with self.assertRaises(ValueError):
+ table_querier(query="What does it do with empty context ?", table={})
+ with self.assertRaises(ValueError):
+ table_querier(
+ table={
+ "Repository": ["Transformers", "Datasets", "Tokenizers"],
+ "Stars": ["36542", "4512", "3934"],
+ "Contributors": ["651", "77", "34"],
+ "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
+ }
+ )
+ with self.assertRaises(ValueError):
+ table_querier(
+ query="",
+ table={
+ "Repository": ["Transformers", "Datasets", "Tokenizers"],
+ "Stars": ["36542", "4512", "3934"],
+ "Contributors": ["651", "77", "34"],
+ "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
+ },
+ )
+ with self.assertRaises(ValueError):
+ table_querier(
+ query=None,
+ table={
+ "Repository": ["Transformers", "Datasets", "Tokenizers"],
+ "Stars": ["36542", "4512", "3934"],
+ "Contributors": ["651", "77", "34"],
+ "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
+ },
+ )
@require_torch
def test_small_model_pt(self):
@@ -148,7 +247,8 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
},
)
- def test_slow_tokenizer_sqa(self):
+ @require_torch
+ def test_slow_tokenizer_sqa_pt(self):
model_id = "lysandre/tiny-tapas-random-sqa"
model = AutoModelForTableQuestionAnswering.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
@@ -265,8 +365,126 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
},
)
+ @require_tf
+ def test_slow_tokenizer_sqa_tf(self):
+ model_id = "lysandre/tiny-tapas-random-sqa"
+ model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True)
+ tokenizer = AutoTokenizer.from_pretrained(model_id)
+ table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer)
+
+ inputs = {
+ "table": {
+ "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
+ "age": ["56", "45", "59"],
+ "number of movies": ["87", "53", "69"],
+ "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
+ },
+ "query": ["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"],
+ }
+ sequential_outputs = table_querier(**inputs, sequential=True)
+ batch_outputs = table_querier(**inputs, sequential=False)
+
+ self.assertEqual(len(sequential_outputs), 3)
+ self.assertEqual(len(batch_outputs), 3)
+ self.assertEqual(sequential_outputs[0], batch_outputs[0])
+ self.assertNotEqual(sequential_outputs[1], batch_outputs[1])
+ # self.assertNotEqual(sequential_outputs[2], batch_outputs[2])
+
+ table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer)
+ outputs = table_querier(
+ table={
+ "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
+ "age": ["56", "45", "59"],
+ "number of movies": ["87", "53", "69"],
+ "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
+ },
+ query="how many movies has george clooney played in?",
+ )
+ self.assertEqual(
+ outputs,
+ {"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
+ )
+ outputs = table_querier(
+ table={
+ "actors": ["brad pitt", "leonardo di caprio", "george clooney"],
+ "age": ["56", "45", "59"],
+ "number of movies": ["87", "53", "69"],
+ "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
+ },
+ query=["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"],
+ )
+ self.assertEqual(
+ outputs,
+ [
+ {"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
+ {"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
+ {"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]},
+ ],
+ )
+ outputs = table_querier(
+ table={
+ "Repository": ["Transformers", "Datasets", "Tokenizers"],
+ "Stars": ["36542", "4512", "3934"],
+ "Contributors": ["651", "77", "34"],
+ "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
+ },
+ query=[
+ "What repository has the largest number of stars?",
+ "Given that the numbers of stars defines if a repository is active, what repository is the most active?",
+ "What is the number of repositories?",
+ "What is the average number of stars?",
+ "What is the total amount of stars?",
+ ],
+ )
+ self.assertEqual(
+ outputs,
+ [
+ {"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
+ {"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
+ {"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
+ {"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
+ {"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]},
+ ],
+ )
+
+ with self.assertRaises(ValueError):
+ table_querier(query="What does it do with empty context ?", table=None)
+ with self.assertRaises(ValueError):
+ table_querier(query="What does it do with empty context ?", table="")
+ with self.assertRaises(ValueError):
+ table_querier(query="What does it do with empty context ?", table={})
+ with self.assertRaises(ValueError):
+ table_querier(
+ table={
+ "Repository": ["Transformers", "Datasets", "Tokenizers"],
+ "Stars": ["36542", "4512", "3934"],
+ "Contributors": ["651", "77", "34"],
+ "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
+ }
+ )
+ with self.assertRaises(ValueError):
+ table_querier(
+ query="",
+ table={
+ "Repository": ["Transformers", "Datasets", "Tokenizers"],
+ "Stars": ["36542", "4512", "3934"],
+ "Contributors": ["651", "77", "34"],
+ "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
+ },
+ )
+ with self.assertRaises(ValueError):
+ table_querier(
+ query=None,
+ table={
+ "Repository": ["Transformers", "Datasets", "Tokenizers"],
+ "Stars": ["36542", "4512", "3934"],
+ "Contributors": ["651", "77", "34"],
+ "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
+ },
+ )
+
@slow
- def test_integration_wtq(self):
+ def test_integration_wtq_pt(self):
table_querier = pipeline("table-question-answering")
data = {
@@ -310,7 +528,54 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
self.assertListEqual(results, expected_results)
@slow
- def test_integration_sqa(self):
+ def test_integration_wtq_tf(self):  # WTQ checkpoint run through the pipeline with an explicitly loaded TF model
+ model_id = "google/tapas-base-finetuned-wtq"
+ model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id)  # model instance is loaded here and handed to pipeline()
+ tokenizer = AutoTokenizer.from_pretrained(model_id)  # tokenizer comes from the same checkpoint id as the model
+ table_querier = pipeline("table-question-answering", model=model, tokenizer=tokenizer)
+
+ data = {  # table fixture: column name -> list of cell strings, one entry per row
+ "Repository": ["Transformers", "Datasets", "Tokenizers"],
+ "Stars": ["36542", "4512", "3934"],
+ "Contributors": ["651", "77", "34"],
+ "Programming language": ["Python", "Python", "Rust, Python and NodeJS"],
+ }
+ queries = [  # five questions asked against the same table in one call
+ "What repository has the largest number of stars?",
+ "Given that the numbers of stars defines if a repository is active, what repository is the most active?",
+ "What is the number of repositories?",
+ "What is the average number of stars?",
+ "What is the total amount of stars?",
+ ]
+
+ results = table_querier(data, queries)  # table first, then the query list, both positional
+
+ expected_results = [  # one expected dict per query, in the same order as `queries`
+ {"answer": "Transformers", "coordinates": [(0, 0)], "cells": ["Transformers"], "aggregator": "NONE"},
+ {"answer": "Transformers", "coordinates": [(0, 0)], "cells": ["Transformers"], "aggregator": "NONE"},
+ {
+ "answer": "COUNT > Transformers, Datasets, Tokenizers",  # aggregated answers read "<AGGREGATOR> > <cells>"
+ "coordinates": [(0, 0), (1, 0), (2, 0)],  # (row, column) into `data`: all three "Repository" cells
+ "cells": ["Transformers", "Datasets", "Tokenizers"],
+ "aggregator": "COUNT",
+ },
+ {
+ "answer": "AVERAGE > 36542, 4512, 3934",
+ "coordinates": [(0, 1), (1, 1), (2, 1)],  # column 1 is "Stars"
+ "cells": ["36542", "4512", "3934"],
+ "aggregator": "AVERAGE",
+ },
+ {
+ "answer": "SUM > 36542, 4512, 3934",
+ "coordinates": [(0, 1), (1, 1), (2, 1)],
+ "cells": ["36542", "4512", "3934"],
+ "aggregator": "SUM",
+ },
+ ]
+ self.assertListEqual(results, expected_results)  # exact match: answers, coordinates, cells and aggregator
+
+ @slow
+ def test_integration_sqa_pt(self):
table_querier = pipeline(
"table-question-answering",
model="google/tapas-base-finetuned-sqa",
@@ -331,3 +596,29 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta):
{"answer": "28 november 1967", "coordinates": [(2, 3)], "cells": ["28 november 1967"]},
]
self.assertListEqual(results, expected_results)
+
+ @slow
+ def test_integration_sqa_tf(self):  # SQA checkpoint run through the pipeline with an explicitly loaded TF model
+ model_id = "google/tapas-base-finetuned-sqa"
+ model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id)  # model instance is loaded here and handed to pipeline()
+ tokenizer = AutoTokenizer.from_pretrained(model_id)  # tokenizer comes from the same checkpoint id as the model
+ table_querier = pipeline(
+ "table-question-answering",
+ model=model,
+ tokenizer=tokenizer,
+ )
+ data = {  # table fixture: column name -> list of cell strings, one entry per row
+ "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"],
+ "Age": ["56", "45", "59"],
+ "Number of movies": ["87", "53", "69"],
+ "Date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"],
+ }
+ queries = ["How many movies has George Clooney played in?", "How old is he?", "What's his date of birth?"]
+ results = table_querier(data, queries, sequential=True)  # NOTE(review): sequential=True presumably lets "he"/"his" build on the earlier query - confirm in pipeline docs
+
+ expected_results = [  # every expected cell sits in row 2 (George Clooney); no "aggregator" key is expected here
+ {"answer": "69", "coordinates": [(2, 2)], "cells": ["69"]},
+ {"answer": "59", "coordinates": [(2, 1)], "cells": ["59"]},
+ {"answer": "28 november 1967", "coordinates": [(2, 3)], "cells": ["28 november 1967"]},
+ ]
+ self.assertListEqual(results, expected_results)  # exact match, in query order