From c468a87a69754b865575f115a2c8efa024c82cb5 Mon Sep 17 00:00:00 2001 From: Kamal Raj Date: Tue, 30 Nov 2021 15:37:55 +0530 Subject: [PATCH] Tapas tf (#13393) * TF Tapas first commit * updated docs * updated logger message * updated pytorch weight conversion script to support scalar array * added use_cache to tapas model config to work properly with tf input_processing * 1. rm embeddings_sum 2. added # Copied 3. + TFTapasMLMHead 4. and lot other small fixes * updated docs * + test for tapas * updated testing_utils to check is_tensorflow_probability_available * converted model logits post processing using numpy to work with both PT and TF models * + TFAutoModelForTableQuestionAnswering * added TF support * added test for TFAutoModelForTableQuestionAnswering * added test for TFAutoModelForTableQuestionAnswering pipeline * updated auto model docs * fixed typo in import * added tensorflow_probability to run tests * updated MLM head * updated tapas.rst with TF model docs * fixed optimizer import in docs * updated convert to np data from pt model is not `transformers.tokenization_utils_base.BatchEncoding` after pipeline upgrade * updated pipeline: 1. with torch.no_gard removed, pipeline forward handles 2. token_type_ids converted to numpy * updated docs. * removed `use_cache` from config * removed floats_tensor * updated code comment * updated Copyright Year and logits_aggregation Optional * updated docs and comments * updated docstring * fixed model weight loading * make fixup * fix indentation * added tf slow pipeline test * pip upgrade * upgrade python to 3.7 * removed from_pt from tests * revert commit f18cfa9 --- .circleci/config.yml | 8 +- docs/source/index.rst | 2 +- docs/source/model_doc/auto.rst | 7 + docs/source/model_doc/tapas.rst | 231 +- src/transformers/__init__.py | 22 + src/transformers/file_utils.py | 19 + src/transformers/modeling_tf_pytorch_utils.py | 4 +- src/transformers/models/auto/__init__.py | 4 + .../models/auto/modeling_tf_auto.py | 27 + src/transformers/models/tapas/__init__.py | 22 +- .../models/tapas/modeling_tf_tapas.py | 2398 +++++++++++++++++ .../models/tapas/tokenization_tapas.py | 11 +- src/transformers/pipelines/__init__.py | 3 +- .../pipelines/table_question_answering.py | 182 +- src/transformers/testing_utils.py | 14 + src/transformers/utils/dummy_tf_objects.py | 78 + tests/test_modeling_tf_auto.py | 30 +- tests/test_modeling_tf_tapas.py | 1036 +++++++ ...test_pipelines_table_question_answering.py | 301 ++- 19 files changed, 4324 insertions(+), 75 deletions(-) create mode 100644 src/transformers/models/tapas/modeling_tf_tapas.py create mode 100644 tests/test_modeling_tf_tapas.py diff --git a/.circleci/config.yml b/.circleci/config.yml index d33e5b223b..cbcb3848e9 100644 --- a/.circleci/config.yml +++ b/.circleci/config.yml @@ -65,7 +65,7 @@ jobs: run_tests_torch_and_tf: working_directory: ~/transformers docker: - - image: circleci/python:3.6 + - image: circleci/python:3.7 environment: OMP_NUM_THREADS: 1 RUN_PT_TF_CROSS_TESTS: yes @@ -82,6 +82,7 @@ jobs: - run: pip install --upgrade pip - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision] - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html + - run: pip install tensorflow_probability - save_cache: key: v0.4-{{ checksum "setup.py" }} paths: @@ -118,6 +119,7 @@ jobs: - run: pip install --upgrade pip - run: pip install .[sklearn,tf-cpu,torch,testing,sentencepiece,torch-speech,vision] - run: pip install torch-scatter -f https://pytorch-geometric.com/whl/torch-1.10.0+cpu.html + - run: pip install tensorflow_probability - save_cache: key: v0.4-{{ checksum "setup.py" }} paths: @@ -278,6 +280,7 @@ jobs: - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision] + - run: pip install tensorflow_probability - save_cache: key: v0.4-tf-{{ checksum "setup.py" }} paths: @@ -311,6 +314,7 @@ jobs: - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - run: pip install .[sklearn,tf-cpu,testing,sentencepiece,tf-speech,vision] + - run: pip install tensorflow_probability - save_cache: key: v0.4-tf-{{ checksum "setup.py" }} paths: @@ -468,6 +472,7 @@ jobs: - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - run: pip install .[sklearn,tf-cpu,testing,sentencepiece] + - run: pip install tensorflow_probability - save_cache: key: v0.4-tf-{{ checksum "setup.py" }} paths: @@ -502,6 +507,7 @@ jobs: - v0.4-{{ checksum "setup.py" }} - run: pip install --upgrade pip - run: pip install .[sklearn,tf-cpu,testing,sentencepiece] + - run: pip install tensorflow_probability - save_cache: key: v0.4-tf-{{ checksum "setup.py" }} paths: diff --git a/docs/source/index.rst b/docs/source/index.rst index c2ce07b2a0..1c91216948 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -499,7 +499,7 @@ Flax), PyTorch, and/or TensorFlow. +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | T5 | ✅ | ✅ | ✅ | ✅ | ✅ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ -| TAPAS | ✅ | ❌ | ✅ | ❌ | ❌ | +| TAPAS | ✅ | ❌ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ | Transformer-XL | ✅ | ❌ | ✅ | ✅ | ❌ | +-----------------------------+----------------+----------------+-----------------+--------------------+--------------+ diff --git a/docs/source/model_doc/auto.rst b/docs/source/model_doc/auto.rst index ef4f158740..ad69cf4dc0 100644 --- a/docs/source/model_doc/auto.rst +++ b/docs/source/model_doc/auto.rst @@ -265,6 +265,13 @@ TFAutoModelForMultipleChoice :members: +TFAutoModelForTableQuestionAnswering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFAutoModelForTableQuestionAnswering + :members: + + TFAutoModelForTokenClassification ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/source/model_doc/tapas.rst b/docs/source/model_doc/tapas.rst index d1cea3226a..face832935 100644 --- a/docs/source/model_doc/tapas.rst +++ b/docs/source/model_doc/tapas.rst @@ -49,7 +49,8 @@ entailment (a binary classification task). For more details, see their follow-up intermediate pre-training `__ by Julian Martin Eisenschlos, Syrine Krichene and Thomas Müller. -This model was contributed by `nielsr `__. The original code can be found `here +This model was contributed by `nielsr `__. The Tensorflow version of this model was +contributed by `kamalkraj `__. The original code can be found `here `__. Tips: @@ -130,6 +131,24 @@ for your environment): >>> config = TapasConfig('google-base-finetuned-wikisql-supervised') >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config) +In TensorFlow, this can be done as follows (make sure to have installed the `tensorflow_probability dependency +__ for your environment): + +.. code-block:: + + >>> from transformers import TapasConfig, TFTapasForQuestionAnswering + + >>> # for example, the base sized model with default SQA configuration + >>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base') + + >>> # or, the base sized model with WTQ configuration + >>> config = TapasConfig.from_pretrained('google/tapas-base-finetuned-wtq') + >>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config) + + >>> # or, the base sized model with WikiSQL configuration + >>> config = TapasConfig('google-base-finetuned-wikisql-supervised') + >>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config) + Of course, you don't necessarily have to follow one of these three ways in which TAPAS was fine-tuned. You can also experiment by defining any hyperparameters you want when initializing :class:`~transformers.TapasConfig`, and then @@ -142,10 +161,21 @@ way. Here's an example: >>> from transformers import TapasConfig, TapasForQuestionAnswering >>> # you can initialize the classification heads any way you want (see docs of TapasConfig) - >>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True, select_one_column=False) + >>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True) >>> # initializing the pre-trained base sized model with our custom classification heads >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config) +And here is the equivalent code for TensorFlow: + +.. code-block:: + + >>> from transformers import TapasConfig, TFTapasForQuestionAnswering + + >>> # you can initialize the classification heads any way you want (see docs of TapasConfig) + >>> config = TapasConfig(num_aggregation_labels=3, average_logits_per_cell=True) + >>> # initializing the pre-trained base sized model with our custom classification heads + >>> model = TFTapasForQuestionAnswering.from_pretrained('google/tapas-base', config=config) + What you can also do is start from an already fine-tuned checkpoint. A note here is that the already fine-tuned checkpoint on WTQ has some issues due to the L2-loss which is somewhat brittle. See `here `__ for more info. @@ -180,12 +210,13 @@ SQA format. The author explains this `here are not perfect (the ``answer_coordinates`` and ``float_answer`` fields are populated based on the ``answer_text``), meaning that WTQ and WikiSQL results could actually be improved. -**STEP 3: Convert your data into PyTorch tensors using TapasTokenizer** +**STEP 3: Convert your data into PyTorch/TensorFlow tensors using TapasTokenizer** Third, given that you've prepared your data in this TSV/CSV format (and corresponding CSV files containing the tabular data), you can then use :class:`~transformers.TapasTokenizer` to convert table-question pairs into :obj:`input_ids`, :obj:`attention_mask`, :obj:`token_type_ids` and so on. Again, based on which of the three cases you picked above, -:class:`~transformers.TapasForQuestionAnswering` requires different inputs to be fine-tuned: +:class:`~transformers.TapasForQuestionAnswering`/:class:`~transformers.TFTapasForQuestionAnswering` requires different +inputs to be fine-tuned: +------------------------------------+----------------------------------------------------------------------------------------------+ | **Task** | **Required inputs** | @@ -220,6 +251,8 @@ are already in the TSV file of step 2. Here's an example: {'input_ids': tensor([[ ... ]]), 'attention_mask': tensor([[...]]), 'token_type_ids': tensor([[[...]]]), 'numeric_values': tensor([[ ... ]]), 'numeric_values_scale: tensor([[ ... ]]), labels: tensor([[ ... ]])} +Set `return_tensors='tf'` when calling the tokenizer to prepare data for the TF models. + Note that :class:`~transformers.TapasTokenizer` expects the data of the table to be **text-only**. You can use ``.astype(str)`` on a dataframe to turn it into text-only data. Of course, this only shows how to encode a single training example. It is advised to create a PyTorch dataset and a corresponding dataloader: @@ -261,15 +294,67 @@ training example. It is advised to create a PyTorch dataset and a corresponding >>> train_dataset = TableDataset(data, tokenizer) >>> train_dataloader = torch.utils.data.DataLoader(train_dataset, batch_size=32) +And here is the equivalent code for TensorFlow: + +.. code-block:: + + >>> import tensorflow as tf + >>> import pandas as pd + + >>> tsv_path = "your_path_to_the_tsv_file" + >>> table_csv_path = "your_path_to_a_directory_containing_all_csv_files" + + >>> class TableDataset: + ... def __init__(self, data, tokenizer): + ... self.data = data + ... self.tokenizer = tokenizer + ... + ... def __iter__(self): + ... for idx in range(self.__len__()): + ... item = self.data.iloc[idx] + ... table = pd.read_csv(table_csv_path + item.table_file).astype(str) # be sure to make your table data text only + ... encoding = self.tokenizer(table=table, + ... queries=item.question, + ... answer_coordinates=item.answer_coordinates, + ... answer_text=item.answer_text, + ... truncation=True, + ... padding="max_length", + ... return_tensors="tf" + ... ) + ... # remove the batch dimension which the tokenizer adds by default + ... encoding = {key: tf.squeeze(val,0) for key, val in encoding.items()} + ... # add the float_answer which is also required (weak supervision for aggregation case) + ... encoding["float_answer"] = tf.convert_to_tensor(item.float_answer,dtype=tf.float32) + ... yield encoding['input_ids'], encoding['attention_mask'], encoding['numeric_values'], \ + ... encoding['numeric_values_scale'], encoding['token_type_ids'], encoding['labels'], \ + ... encoding['float_answer'] + ... + ... def __len__(self): + ... return len(self.data) + + >>> data = pd.read_csv(tsv_path, sep='\t') + >>> train_dataset = TableDataset(data, tokenizer) + >>> output_signature = ( + ... tf.TensorSpec(shape=(512,), dtype=tf.int32), + ... tf.TensorSpec(shape=(512,), dtype=tf.int32), + ... tf.TensorSpec(shape=(512,), dtype=tf.float32), + ... tf.TensorSpec(shape=(512,), dtype=tf.float32), + ... tf.TensorSpec(shape=(512,7), dtype=tf.int32), + ... tf.TensorSpec(shape=(512,), dtype=tf.int32), + ... tf.TensorSpec(shape=(512,), dtype=tf.float32)) + >>> train_dataloader = tf.data.Dataset.from_generator(train_dataset, output_signature=output_signature).batch(32) + Note that here, we encode each table-question pair independently. This is fine as long as your dataset is **not conversational**. In case your dataset involves conversational questions (such as in SQA), then you should first group together the ``queries``, ``answer_coordinates`` and ``answer_text`` per table (in the order of their ``position`` index) and batch encode each table with its questions. This will make sure that the ``prev_labels`` token types (see docs of :class:`~transformers.TapasTokenizer`) are set correctly. See `this notebook `__ -for more info. +for more info. See `this notebook +`__ +for more info regarding using the TensorFlow model. -**STEP 4: Train (fine-tune) TapasForQuestionAnswering** +**STEP 4: Train (fine-tune) TapasForQuestionAnswering/TFTapasForQuestionAnswering** You can then fine-tune :class:`~transformers.TapasForQuestionAnswering` using native PyTorch as follows (shown here for the weak supervision for aggregation case): @@ -316,6 +401,52 @@ the weak supervision for aggregation case): ... loss.backward() ... optimizer.step() + +Equivalently, fine-tuning :class:`~transformers.TFTapasForQuestionAnswering` in native TensorFlow can be done as +follows (shown here for the weak supervision for aggregation case): + +.. code-block:: + + >>> import tensorflow as tf + >>> from transformers import TapasConfig, TFTapasForQuestionAnswering + + >>> # this is the default WTQ configuration + >>> config = TapasConfig( + ... num_aggregation_labels = 4, + ... use_answer_as_supervision = True, + ... answer_loss_cutoff = 0.664694, + ... cell_selection_preference = 0.207951, + ... huber_loss_delta = 0.121194, + ... init_cell_selection_weights_to_zero = True, + ... select_one_column = True, + ... allow_empty_column_selection = False, + ... temperature = 0.0352513, + ... ) + >>> model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base", config=config) + + >>> optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5) + + >>> for epoch in range(2): # loop over the dataset multiple times + ... for idx, batch in enumerate(train_dataloader): + ... # get the inputs; + ... input_ids = batch[0] + ... attention_mask = batch[1] + ... token_type_ids = batch[4] + ... labels = batch[-1] + ... numeric_values = batch[2] + ... numeric_values_scale = batch[3] + ... float_answer = batch[6] + + ... # forward + backward + optimize + ... with tf.GradientTape() as tape: + ... outputs = model(input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, + ... labels=labels, numeric_values=numeric_values, numeric_values_scale=numeric_values_scale, + ... float_answer=float_answer ) + ... grads = tape.gradient(outputs.loss, model.trainable_weights) + ... optimizer.apply_gradients(zip(grads, model.trainable_weights)) + + + Usage: inference ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -380,10 +511,68 @@ of that: What is the total number of movies? Predicted answer: SUM > 87, 53, 69 + +And here is the equivalent code for TensorFlow: + +.. code-block:: + + >>> from transformers import TapasTokenizer, TFTapasForQuestionAnswering + >>> import pandas as pd + + >>> model_name = 'google/tapas-base-finetuned-wtq' + >>> model = TFTapasForQuestionAnswering.from_pretrained(model_name) + >>> tokenizer = TapasTokenizer.from_pretrained(model_name) + + >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], 'Number of movies': ["87", "53", "69"]} + >>> queries = ["What is the name of the first actor?", "How many movies has George Clooney played in?", "What is the total number of movies?"] + >>> table = pd.DataFrame.from_dict(data) + >>> inputs = tokenizer(table=table, queries=queries, padding='max_length', return_tensors="tf") + >>> outputs = model(**inputs) + >>> predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( + ... inputs, + ... outputs.logits, + ... outputs.logits_aggregation + ... ) + + >>> # let's print out the results: + >>> id2aggregation = {0: "NONE", 1: "SUM", 2: "AVERAGE", 3:"COUNT"} + >>> aggregation_predictions_string = [id2aggregation[x] for x in predicted_aggregation_indices] + + >>> answers = [] + >>> for coordinates in predicted_answer_coordinates: + ... if len(coordinates) == 1: + ... # only a single cell: + ... answers.append(table.iat[coordinates[0]]) + ... else: + ... # multiple cells + ... cell_values = [] + ... for coordinate in coordinates: + ... cell_values.append(table.iat[coordinate]) + ... answers.append(", ".join(cell_values)) + + >>> display(table) + >>> print("") + >>> for query, answer, predicted_agg in zip(queries, answers, aggregation_predictions_string): + ... print(query) + ... if predicted_agg == "NONE": + ... print("Predicted answer: " + answer) + ... else: + ... print("Predicted answer: " + predicted_agg + " > " + answer) + What is the name of the first actor? + Predicted answer: Brad Pitt + How many movies has George Clooney played in? + Predicted answer: COUNT > 69 + What is the total number of movies? + Predicted answer: SUM > 87, 53, 69 + + In case of a conversational set-up, then each table-question pair must be provided **sequentially** to the model, such that the ``prev_labels`` token types can be overwritten by the predicted ``labels`` of the previous table-question pair. Again, more info can be found in `this notebook -`__. +`__ +(for PyTorch) and `this notebook +`__ +(for TensorFlow). Tapas specific outputs @@ -433,3 +622,31 @@ TapasForQuestionAnswering .. autoclass:: transformers.TapasForQuestionAnswering :members: forward + + +TFTapasModel +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFTapasModel + :members: call + + +TFTapasForMaskedLM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFTapasForMaskedLM + :members: call + + +TFTapasForSequenceClassification +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFTapasForSequenceClassification + :members: call + + +TFTapasForQuestionAnswering +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +.. autoclass:: transformers.TFTapasForQuestionAnswering + :members: call diff --git a/src/transformers/__init__.py b/src/transformers/__init__.py index 5595bd1c9b..a6d0b2a2eb 100755 --- a/src/transformers/__init__.py +++ b/src/transformers/__init__.py @@ -1446,6 +1446,7 @@ if is_tf_available(): "TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING", "TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING", "TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING", + "TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", "TF_MODEL_MAPPING", "TF_MODEL_WITH_LM_HEAD_MAPPING", @@ -1458,6 +1459,7 @@ if is_tf_available(): "TFAutoModelForQuestionAnswering", "TFAutoModelForSeq2SeqLM", "TFAutoModelForSequenceClassification", + "TFAutoModelForTableQuestionAnswering", "TFAutoModelForTokenClassification", "TFAutoModelWithLMHead", ] @@ -1767,6 +1769,16 @@ if is_tf_available(): "TFT5PreTrainedModel", ] ) + _import_structure["models.tapas"].extend( + [ + "TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFTapasForMaskedLM", + "TFTapasForQuestionAnswering", + "TFTapasForSequenceClassification", + "TFTapasModel", + "TFTapasPreTrainedModel", + ] + ) _import_structure["models.transfo_xl"].extend( [ "TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST", @@ -3225,6 +3237,7 @@ if TYPE_CHECKING: TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, TF_MODEL_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING, @@ -3237,6 +3250,7 @@ if TYPE_CHECKING: TFAutoModelForQuestionAnswering, TFAutoModelForSeq2SeqLM, TFAutoModelForSequenceClassification, + TFAutoModelForTableQuestionAnswering, TFAutoModelForTokenClassification, TFAutoModelWithLMHead, ) @@ -3483,6 +3497,14 @@ if TYPE_CHECKING: TFT5Model, TFT5PreTrainedModel, ) + from .models.tapas import ( + TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST, + TFTapasForMaskedLM, + TFTapasForQuestionAnswering, + TFTapasForSequenceClassification, + TFTapasModel, + TFTapasPreTrainedModel, + ) from .models.transfo_xl import ( TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST, TFAdaptiveEmbedding, diff --git a/src/transformers/file_utils.py b/src/transformers/file_utils.py index 5d99b36c14..5294f3aab7 100644 --- a/src/transformers/file_utils.py +++ b/src/transformers/file_utils.py @@ -213,6 +213,14 @@ except importlib_metadata.PackageNotFoundError: _soundfile_available = False +_tensorflow_probability_available = importlib.util.find_spec("tensorflow_probability") is not None +try: + _tensorflow_probability_version = importlib_metadata.version("tensorflow_probability") + logger.debug(f"Successfully imported tensorflow-probability version {_tensorflow_probability_version}") +except importlib_metadata.PackageNotFoundError: + _tensorflow_probability_available = False + + _timm_available = importlib.util.find_spec("timm") is not None try: _timm_version = importlib_metadata.version("timm") @@ -444,6 +452,10 @@ def is_pytorch_quantization_available(): return _pytorch_quantization_available +def is_tensorflow_probability_available(): + return _tensorflow_probability_available + + def is_pandas_available(): return importlib.util.find_spec("pandas") is not None @@ -629,6 +641,12 @@ PYTORCH_QUANTIZATION_IMPORT_ERROR = """ `pip install pytorch-quantization --extra-index-url https://pypi.ngc.nvidia.com` """ +# docstyle-ignore +TENSORFLOW_PROBABILITY_IMPORT_ERROR = """ +{0} requires the tensorflow_probability library but it was not found in your environment. You can install it with pip as +explained here: https://github.com/tensorflow/probability. +""" + # docstyle-ignore PANDAS_IMPORT_ERROR = """ @@ -684,6 +702,7 @@ BACKENDS_MAPPING = OrderedDict( ("sentencepiece", (is_sentencepiece_available, SENTENCEPIECE_IMPORT_ERROR)), ("sklearn", (is_sklearn_available, SKLEARN_IMPORT_ERROR)), ("speech", (is_speech_available, SPEECH_IMPORT_ERROR)), + ("tensorflow_probability", (is_tensorflow_probability_available, TENSORFLOW_PROBABILITY_IMPORT_ERROR)), ("tf", (is_tf_available, TENSORFLOW_IMPORT_ERROR)), ("timm", (is_timm_available, TIMM_IMPORT_ERROR)), ("tokenizers", (is_tokenizers_available, TOKENIZERS_IMPORT_ERROR)), diff --git a/src/transformers/modeling_tf_pytorch_utils.py b/src/transformers/modeling_tf_pytorch_utils.py index 9ed2a0a1cd..037525979b 100644 --- a/src/transformers/modeling_tf_pytorch_utils.py +++ b/src/transformers/modeling_tf_pytorch_utils.py @@ -399,7 +399,9 @@ def load_tf2_weights_in_pytorch_model(pt_model, tf_weights, allow_missing_keys=F raise e # logger.warning(f"Initialize PyTorch weight {pt_weight_name}") - + # Make sure we have a proper numpy array + if numpy.isscalar(array): + array = numpy.array(array) new_pt_params_dict[pt_weight_name] = torch.from_numpy(array) loaded_pt_weights_data_ptr[pt_weight.data_ptr()] = torch.from_numpy(array) all_tf_weights.discard(pt_weight_name) diff --git a/src/transformers/models/auto/__init__.py b/src/transformers/models/auto/__init__.py index ba4ba2dd7f..bd6e3a369b 100644 --- a/src/transformers/models/auto/__init__.py +++ b/src/transformers/models/auto/__init__.py @@ -83,6 +83,7 @@ if is_tf_available(): "TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING", "TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING", "TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING", + "TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING", "TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING", "TF_MODEL_MAPPING", "TF_MODEL_WITH_LM_HEAD_MAPPING", @@ -95,6 +96,7 @@ if is_tf_available(): "TFAutoModelForQuestionAnswering", "TFAutoModelForSeq2SeqLM", "TFAutoModelForSequenceClassification", + "TFAutoModelForTableQuestionAnswering", "TFAutoModelForTokenClassification", "TFAutoModelWithLMHead", ] @@ -189,6 +191,7 @@ if TYPE_CHECKING: TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, TF_MODEL_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING, @@ -201,6 +204,7 @@ if TYPE_CHECKING: TFAutoModelForQuestionAnswering, TFAutoModelForSeq2SeqLM, TFAutoModelForSequenceClassification, + TFAutoModelForTableQuestionAnswering, TFAutoModelForTokenClassification, TFAutoModelWithLMHead, ) diff --git a/src/transformers/models/auto/modeling_tf_auto.py b/src/transformers/models/auto/modeling_tf_auto.py index 0ff9eed872..d32abfdd8a 100644 --- a/src/transformers/models/auto/modeling_tf_auto.py +++ b/src/transformers/models/auto/modeling_tf_auto.py @@ -59,6 +59,7 @@ TF_MODEL_MAPPING_NAMES = OrderedDict( ("funnel", ("TFFunnelModel", "TFFunnelBaseModel")), ("dpr", "TFDPRQuestionEncoder"), ("mpnet", "TFMPNetModel"), + ("tapas", "TFTapasModel"), ("mbart", "TFMBartModel"), ("marian", "TFMarianModel"), ("pegasus", "TFPegasusModel"), @@ -92,6 +93,7 @@ TF_MODEL_FOR_PRETRAINING_MAPPING_NAMES = OrderedDict( ("xlm", "TFXLMWithLMHeadModel"), ("ctrl", "TFCTRLLMHeadModel"), ("electra", "TFElectraForPreTraining"), + ("tapas", "TFTapasForMaskedLM"), ("funnel", "TFFunnelForPreTraining"), ("mpnet", "TFMPNetForMaskedLM"), ] @@ -124,6 +126,7 @@ TF_MODEL_WITH_LM_HEAD_MAPPING_NAMES = OrderedDict( ("xlm", "TFXLMWithLMHeadModel"), ("ctrl", "TFCTRLLMHeadModel"), ("electra", "TFElectraForMaskedLM"), + ("tapas", "TFTapasForMaskedLM"), ("funnel", "TFFunnelForMaskedLM"), ("mpnet", "TFMPNetForMaskedLM"), ] @@ -172,6 +175,7 @@ TF_MODEL_FOR_MASKED_LM_MAPPING_NAMES = OrderedDict( ("flaubert", "TFFlaubertWithLMHeadModel"), ("xlm", "TFXLMWithLMHeadModel"), ("electra", "TFElectraForMaskedLM"), + ("tapas", "TFTapasForMaskedLM"), ("funnel", "TFFunnelForMaskedLM"), ("mpnet", "TFMPNetForMaskedLM"), ] @@ -215,6 +219,7 @@ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING_NAMES = OrderedDict( ("flaubert", "TFFlaubertForSequenceClassification"), ("xlm", "TFXLMForSequenceClassification"), ("electra", "TFElectraForSequenceClassification"), + ("tapas", "TFTapasForSequenceClassification"), ("funnel", "TFFunnelForSequenceClassification"), ("gpt2", "TFGPT2ForSequenceClassification"), ("mpnet", "TFMPNetForSequenceClassification"), @@ -249,6 +254,14 @@ TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( ] ) +TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES = OrderedDict( + [ + # Model for Table Question Answering mapping + ("tapas", "TFTapasForQuestionAnswering"), + ] +) + + TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES = OrderedDict( [ # Model for Token Classification mapping @@ -323,6 +336,9 @@ TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = _LazyAutoMapping( TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING_NAMES ) +TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = _LazyAutoMapping( + CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING_NAMES +) TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = _LazyAutoMapping( CONFIG_MAPPING_NAMES, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING_NAMES ) @@ -402,6 +418,17 @@ class TFAutoModelForQuestionAnswering(_BaseAutoModelClass): TFAutoModelForQuestionAnswering = auto_class_update(TFAutoModelForQuestionAnswering, head_doc="question answering") +class TFAutoModelForTableQuestionAnswering(_BaseAutoModelClass): + _model_mapping = TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING + + +TFAutoModelForTableQuestionAnswering = auto_class_update( + TFAutoModelForTableQuestionAnswering, + head_doc="table question answering", + checkpoint_for_example="google/tapas-base-finetuned-wtq", +) + + class TFAutoModelForTokenClassification(_BaseAutoModelClass): _model_mapping = TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING diff --git a/src/transformers/models/tapas/__init__.py b/src/transformers/models/tapas/__init__.py index 3949ed6fba..9f1d442b98 100644 --- a/src/transformers/models/tapas/__init__.py +++ b/src/transformers/models/tapas/__init__.py @@ -18,7 +18,7 @@ from typing import TYPE_CHECKING -from ...file_utils import _LazyModule, is_torch_available +from ...file_utils import _LazyModule, is_tf_available, is_torch_available _import_structure = { @@ -36,6 +36,15 @@ if is_torch_available(): "TapasPreTrainedModel", "load_tf_weights_in_tapas", ] +if is_tf_available(): + _import_structure["modeling_tf_tapas"] = [ + "TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST", + "TFTapasForMaskedLM", + "TFTapasForQuestionAnswering", + "TFTapasForSequenceClassification", + "TFTapasModel", + "TFTapasPreTrainedModel", + ] if TYPE_CHECKING: @@ -53,6 +62,17 @@ if TYPE_CHECKING: load_tf_weights_in_tapas, ) + if is_tf_available(): + from .modeling_tf_tapas import ( + TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST, + TFTapasForMaskedLM, + TFTapasForQuestionAnswering, + TFTapasForSequenceClassification, + TFTapasModel, + TFTapasPreTrainedModel, + ) + + else: import sys diff --git a/src/transformers/models/tapas/modeling_tf_tapas.py b/src/transformers/models/tapas/modeling_tf_tapas.py new file mode 100644 index 0000000000..ada1194067 --- /dev/null +++ b/src/transformers/models/tapas/modeling_tf_tapas.py @@ -0,0 +1,2398 @@ +# coding=utf-8 +# Copyright 2021 Google Research and The HuggingFace Inc. team. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +"""TF 2.0 TAPAS model. """ + +import enum +import math +from dataclasses import dataclass +from typing import Dict, Optional, Tuple, Union + +import numpy as np +import tensorflow as tf + +from ...activations_tf import get_tf_activation +from ...file_utils import ( + ModelOutput, + add_code_sample_docstrings, + add_start_docstrings, + add_start_docstrings_to_model_forward, + is_tensorflow_probability_available, + requires_backends, +) +from ...modeling_tf_outputs import ( + TFBaseModelOutputWithPastAndCrossAttentions, + TFBaseModelOutputWithPooling, + TFMaskedLMOutput, + TFSequenceClassifierOutput, +) +from ...modeling_tf_utils import ( + TFMaskedLanguageModelingLoss, + TFModelInputType, + TFPreTrainedModel, + TFSequenceClassificationLoss, + get_initializer, + input_processing, + keras_serializable, + shape_list, +) +from ...utils import logging +from .configuration_tapas import TapasConfig + + +logger = logging.get_logger(__name__) + +# soft dependency +if is_tensorflow_probability_available(): + try: + import tensorflow_probability as tfp + + # On the first call, check whether a compatible version of TensorFlow is installed + # TensorFlow Probability depends on a recent stable release of TensorFlow + n = tfp.distributions.Normal(loc=0.0, scale=1.0) + except ImportError: + logger.error( + "TAPAS models are not usable since `tensorflow_probability` can't be loaded." + "It seems you have `tensorflow_probability` installed with the wrong tensorflow version." + "Please try to reinstall it following the instructions here: https://github.com/tensorflow/probability." + ) + +_CONFIG_FOR_DOC = "TapasConfig" +_TOKENIZER_FOR_DOC = "TapasTokenizer" +_CHECKPOINT_FOR_DOC = "google/tapas-base" + +TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST = [ + # large models + "google/tapas-large", + "google/tapas-large-finetuned-sqa", + "google/tapas-large-finetuned-wtq", + "google/tapas-large-finetuned-wikisql-supervised", + "google/tapas-large-finetuned-tabfact", + # base models + "google/tapas-base", + "google/tapas-base-finetuned-sqa", + "google/tapas-base-finetuned-wtq", + "google/tapas-base-finetuned-wikisql-supervised", + "google/tapas-base-finetuned-tabfact", + # small models + "google/tapas-small", + "google/tapas-small-finetuned-sqa", + "google/tapas-small-finetuned-wtq", + "google/tapas-small-finetuned-wikisql-supervised", + "google/tapas-small-finetuned-tabfact", + # mini models + "google/tapas-mini", + "google/tapas-mini-finetuned-sqa", + "google/tapas-mini-finetuned-wtq", + "google/tapas-mini-finetuned-wikisql-supervised", + "google/tapas-mini-finetuned-tabfact", + # tiny models + "google/tapas-tiny", + "google/tapas-tiny-finetuned-sqa", + "google/tapas-tiny-finetuned-wtq", + "google/tapas-tiny-finetuned-wikisql-supervised", + "google/tapas-tiny-finetuned-tabfact", + # See all TAPAS models at https://huggingface.co/models?filter=tapas +] + +EPSILON_ZERO_DIVISION = 1e-10 +CLOSE_ENOUGH_TO_LOG_ZERO = -10000.0 + + +@dataclass +class TFTableQuestionAnsweringOutput(ModelOutput): + """ + Output type of :class:`~transformers.TFTapasForQuestionAnswering`. + + Args: + loss (:obj:`tf.Tensor` of shape :obj:`(1,)`, `optional`, returned when :obj:`labels` (and possibly :obj:`answer`, :obj:`aggregation_labels`, :obj:`numeric_values` and :obj:`numeric_values_scale` are provided)): + Total loss as the sum of the hierarchical cell selection log-likelihood loss and (optionally) the + semi-supervised regression loss and (optionally) supervised loss for aggregations. + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): + Prediction scores of the cell selection head, for every token. + logits_aggregation (:obj:`tf.Tensor`, `optional`, of shape :obj:`(batch_size, num_aggregation_labels)`): + Prediction scores of the aggregation head, for every aggregation operator. + hidden_states (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_hidden_states=True`` is passed or when ``config.output_hidden_states=True``): + Tuple of :obj:`tf.Tensor` (one for the output of the embeddings + one for the output of each layer) of + shape :obj:`(batch_size, sequence_length, hidden_size)`. Hidden-states of the model at the output of each + layer plus the initial embedding outputs. + attentions (:obj:`tuple(tf.Tensor)`, `optional`, returned when ``output_attentions=True`` is passed or when ``config.output_attentions=True``): + Tuple of :obj:`tf.Tensor` (one for each layer) of shape :obj:`(batch_size, num_heads, sequence_length, + sequence_length)`. Attentions weights after the attention softmax, used to compute the weighted average in + the self-attention heads. + """ + + loss: Optional[tf.Tensor] = None + logits: tf.Tensor = None + logits_aggregation: Optional[tf.Tensor] = None + hidden_states: Optional[Tuple[tf.Tensor]] = None + attentions: Optional[Tuple[tf.Tensor]] = None + + +class TFTapasEmbeddings(tf.keras.layers.Layer): + """ + Construct the embeddings from word, position and token_type embeddings. Same as BertEmbeddings but with a number of + additional token type embeddings to encode tabular structure. + """ + + def __init__(self, config: TapasConfig, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = config.vocab_size + self.type_vocab_sizes = config.type_vocab_sizes + self.number_of_token_type_embeddings = len(config.type_vocab_sizes) + self.reset_position_index_per_cell = config.reset_position_index_per_cell + self.hidden_size = config.hidden_size + self.max_position_embeddings = config.max_position_embeddings + self.initializer_range = config.initializer_range + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + + def build(self, input_shape: tf.TensorShape): + with tf.name_scope("word_embeddings"): + self.weight = self.add_weight( + name="weight", + shape=[self.vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + + with tf.name_scope("position_embeddings"): + self.position_embeddings = self.add_weight( + name="embeddings", + shape=[self.max_position_embeddings, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ) + for i, type_vocab_size in enumerate(self.type_vocab_sizes): + with tf.name_scope(f"token_type_embeddings_{i}"): + setattr( + self, + f"token_type_embeddings_{i}", + self.add_weight( + name="embeddings", + shape=[type_vocab_size, self.hidden_size], + initializer=get_initializer(self.initializer_range), + ), + ) + + super().build(input_shape) + + def call( + self, + input_ids: tf.Tensor = None, + position_ids: tf.Tensor = None, + token_type_ids: tf.Tensor = None, + inputs_embeds: tf.Tensor = None, + training: bool = False, + ) -> tf.Tensor: + """ + Applies embedding based on inputs tensor. + + Returns: + final_embeddings (:obj:`tf.Tensor`): output embedding tensor. + """ + assert not (input_ids is None and inputs_embeds is None) + if input_ids is not None: + input_shape = shape_list(input_ids) + else: + input_shape = shape_list(inputs_embeds)[:-1] + + seq_length = input_shape[1] + + if token_type_ids is None: + token_type_ids = tf.fill(dims=input_shape + [self.number_of_token_type_embeddings], value=0) + + if position_ids is None: + # create absolute position embeddings + position_ids = tf.expand_dims(tf.range(start=0, limit=seq_length), axis=0) + position_ids = tf.broadcast_to(position_ids, shape=input_shape) + # when self.config.reset_position_index_per_cell is set to True, create relative position embeddings + if self.reset_position_index_per_cell: + + # shape (batch_size, seq_len) + col_index = IndexMap(token_type_ids[:, :, 1], self.type_vocab_sizes[1], batch_dims=1) + # shape (batch_size, seq_len) + row_index = IndexMap(token_type_ids[:, :, 2], self.type_vocab_sizes[2], batch_dims=1) + # shape (batch_size, seq_len) + full_index = ProductIndexMap(col_index, row_index) + # shape (max_rows * max_columns,). First absolute position for every cell + first_position_per_segment = reduce_min(position_ids, full_index)[0] + # ? shape (batch_size, seq_len). First absolute position of the cell for every token + first_position = gather(first_position_per_segment, full_index) + # shape (1, seq_len) + position = tf.expand_dims(tf.range(start=0, limit=seq_length), axis=0) + position_ids = tf.math.minimum(self.max_position_embeddings - 1, position - first_position) + + if input_ids is not None: + inputs_embeds = tf.gather(params=self.weight, indices=input_ids) + + position_embeddings = tf.gather(self.position_embeddings, indices=position_ids) + + final_embeddings = inputs_embeds + position_embeddings + + for i in range(self.number_of_token_type_embeddings): + name = f"token_type_embeddings_{i}" + final_embeddings += tf.gather(params=getattr(self, name), indices=token_type_ids[:, :, i]) + + final_embeddings = self.LayerNorm(inputs=final_embeddings) + final_embeddings = self.dropout(inputs=final_embeddings, training=training) + + return final_embeddings + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfAttention with Bert->Tapas +class TFTapasSelfAttention(tf.keras.layers.Layer): + def __init__(self, config: TapasConfig, **kwargs): + super().__init__(**kwargs) + + if config.hidden_size % config.num_attention_heads != 0: + raise ValueError( + f"The hidden size ({config.hidden_size}) is not a multiple of the number " + f"of attention heads ({config.num_attention_heads})" + ) + + self.num_attention_heads = config.num_attention_heads + self.attention_head_size = int(config.hidden_size / config.num_attention_heads) + self.all_head_size = self.num_attention_heads * self.attention_head_size + self.sqrt_att_head_size = math.sqrt(self.attention_head_size) + + self.query = tf.keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="query" + ) + self.key = tf.keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="key" + ) + self.value = tf.keras.layers.Dense( + units=self.all_head_size, kernel_initializer=get_initializer(config.initializer_range), name="value" + ) + self.dropout = tf.keras.layers.Dropout(rate=config.attention_probs_dropout_prob) + + self.is_decoder = config.is_decoder + + def transpose_for_scores(self, tensor: tf.Tensor, batch_size: int) -> tf.Tensor: + # Reshape from [batch_size, seq_length, all_head_size] to [batch_size, seq_length, num_attention_heads, attention_head_size] + tensor = tf.reshape(tensor=tensor, shape=(batch_size, -1, self.num_attention_heads, self.attention_head_size)) + + # Transpose the tensor from [batch_size, seq_length, num_attention_heads, attention_head_size] to [batch_size, num_attention_heads, seq_length, attention_head_size] + return tf.transpose(tensor, perm=[0, 2, 1, 3]) + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: Tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + batch_size = shape_list(hidden_states)[0] + mixed_query_layer = self.query(inputs=hidden_states) + + # If this is instantiated as a cross-attention module, the keys + # and values come from an encoder; the attention mask needs to be + # such that the encoder's padding tokens are not attended to. + is_cross_attention = encoder_hidden_states is not None + + if is_cross_attention and past_key_value is not None: + # reuse k,v, cross_attentions + key_layer = past_key_value[0] + value_layer = past_key_value[1] + attention_mask = encoder_attention_mask + elif is_cross_attention: + key_layer = self.transpose_for_scores(self.key(inputs=encoder_hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=encoder_hidden_states), batch_size) + attention_mask = encoder_attention_mask + elif past_key_value is not None: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + key_layer = tf.concatenate([past_key_value[0], key_layer], dim=2) + value_layer = tf.concatenate([past_key_value[1], value_layer], dim=2) + else: + key_layer = self.transpose_for_scores(self.key(inputs=hidden_states), batch_size) + value_layer = self.transpose_for_scores(self.value(inputs=hidden_states), batch_size) + + query_layer = self.transpose_for_scores(mixed_query_layer, batch_size) + + if self.is_decoder: + # if cross_attention save Tuple(tf.Tensor, tf.Tensor) of all cross attention key/value_states. + # Further calls to cross_attention layer can then reuse all cross-attention + # key/value_states (first "if" case) + # if uni-directional self-attention (decoder) save Tuple(tf.Tensor, tf.Tensor) of + # all previous decoder key/value_states. Further calls to uni-directional self-attention + # can concat previous decoder key/value_states to current projected key/value_states (third "elif" case) + # if encoder bi-directional self-attention `past_key_value` is always `None` + past_key_value = (key_layer, value_layer) + + # Take the dot product between "query" and "key" to get the raw attention scores. + # (batch size, num_heads, seq_len_q, seq_len_k) + attention_scores = tf.matmul(query_layer, key_layer, transpose_b=True) + dk = tf.cast(self.sqrt_att_head_size, dtype=attention_scores.dtype) + attention_scores = tf.divide(attention_scores, dk) + + if attention_mask is not None: + # Apply the attention mask is (precomputed for all layers in TFTapasModel call() function) + attention_scores = tf.add(attention_scores, attention_mask) + + # Normalize the attention scores to probabilities. + attention_probs = tf.nn.softmax(logits=attention_scores, axis=-1) + + # This is actually dropping out entire tokens to attend to, which might + # seem a bit unusual, but is taken from the original Transformer paper. + attention_probs = self.dropout(inputs=attention_probs, training=training) + + # Mask heads if we want to + if head_mask is not None: + attention_probs = tf.multiply(attention_probs, head_mask) + + attention_output = tf.matmul(attention_probs, value_layer) + attention_output = tf.transpose(attention_output, perm=[0, 2, 1, 3]) + + # (batch_size, seq_len_q, all_head_size) + attention_output = tf.reshape(tensor=attention_output, shape=(batch_size, -1, self.all_head_size)) + outputs = (attention_output, attention_probs) if output_attentions else (attention_output,) + + if self.is_decoder: + outputs = outputs + (past_key_value,) + return outputs + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertSelfOutput with Bert->Tapas +class TFTapasSelfOutput(tf.keras.layers.Layer): + def __init__(self, config: TapasConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertAttention with Bert->Tapas +class TFTapasAttention(tf.keras.layers.Layer): + def __init__(self, config: TapasConfig, **kwargs): + super().__init__(**kwargs) + + self.self_attention = TFTapasSelfAttention(config, name="self") + self.dense_output = TFTapasSelfOutput(config, name="output") + + def prune_heads(self, heads): + raise NotImplementedError + + def call( + self, + input_tensor: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: tf.Tensor, + encoder_attention_mask: tf.Tensor, + past_key_value: Tuple[tf.Tensor], + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + self_outputs = self.self_attention( + hidden_states=input_tensor, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self.dense_output( + hidden_states=self_outputs[0], input_tensor=input_tensor, training=training + ) + # add attentions (possibly with past_key_value) if we output them + outputs = (attention_output,) + self_outputs[1:] + + return outputs + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertIntermediate with Bert->Tapas +class TFTapasIntermediate(tf.keras.layers.Layer): + def __init__(self, config: TapasConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + units=config.intermediate_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + + if isinstance(config.hidden_act, str): + self.intermediate_act_fn = get_tf_activation(config.hidden_act) + else: + self.intermediate_act_fn = config.hidden_act + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.intermediate_act_fn(hidden_states) + + return hidden_states + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertOutput with Bert->Tapas +class TFTapasOutput(tf.keras.layers.Layer): + def __init__(self, config: TapasConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, kernel_initializer=get_initializer(config.initializer_range), name="dense" + ) + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + self.dropout = tf.keras.layers.Dropout(rate=config.hidden_dropout_prob) + + def call(self, hidden_states: tf.Tensor, input_tensor: tf.Tensor, training: bool = False) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.dropout(inputs=hidden_states, training=training) + hidden_states = self.LayerNorm(inputs=hidden_states + input_tensor) + + return hidden_states + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertLayer with Bert->Tapas +class TFTapasLayer(tf.keras.layers.Layer): + def __init__(self, config: TapasConfig, **kwargs): + super().__init__(**kwargs) + + self.attention = TFTapasAttention(config, name="attention") + self.is_decoder = config.is_decoder + self.add_cross_attention = config.add_cross_attention + if self.add_cross_attention: + if not self.is_decoder: + raise ValueError(f"{self} should be used as a decoder model if cross attention is added") + self.crossattention = TFTapasAttention(config, name="crossattention") + self.intermediate = TFTapasIntermediate(config, name="intermediate") + self.bert_output = TFTapasOutput(config, name="output") + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: Optional[tf.Tensor], + encoder_attention_mask: Optional[tf.Tensor], + past_key_value: Optional[Tuple[tf.Tensor]], + output_attentions: bool, + training: bool = False, + ) -> Tuple[tf.Tensor]: + # decoder uni-directional self-attention cached key/values tuple is at positions 1,2 + self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None + self_attention_outputs = self.attention( + input_tensor=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_value=self_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = self_attention_outputs[0] + + # if decoder, the last output is tuple of self-attn cache + if self.is_decoder: + outputs = self_attention_outputs[1:-1] + present_key_value = self_attention_outputs[-1] + else: + outputs = self_attention_outputs[1:] # add self attentions if we output attention weights + + cross_attn_present_key_value = None + if self.is_decoder and encoder_hidden_states is not None: + if not hasattr(self, "crossattention"): + raise ValueError( + f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers " + "by setting `config.add_cross_attention=True`" + ) + + # cross_attn cached key/values tuple is at positions 3,4 of past_key_value tuple + cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None + cross_attention_outputs = self.crossattention( + input_tensor=attention_output, + attention_mask=attention_mask, + head_mask=head_mask, + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=cross_attn_past_key_value, + output_attentions=output_attentions, + training=training, + ) + attention_output = cross_attention_outputs[0] + outputs = outputs + cross_attention_outputs[1:-1] # add cross attentions if we output attention weights + + # add cross-attn cache to positions 3,4 of present_key_value tuple + cross_attn_present_key_value = cross_attention_outputs[-1] + present_key_value = present_key_value + cross_attn_present_key_value + + intermediate_output = self.intermediate(hidden_states=attention_output) + layer_output = self.bert_output( + hidden_states=intermediate_output, input_tensor=attention_output, training=training + ) + outputs = (layer_output,) + outputs # add attentions if we output them + + # if decoder, return the attn key/values as the last output + if self.is_decoder: + outputs = outputs + (present_key_value,) + + return outputs + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertEncoder with Bert->Tapas +class TFTapasEncoder(tf.keras.layers.Layer): + def __init__(self, config: TapasConfig, **kwargs): + super().__init__(**kwargs) + self.config = config + self.layer = [TFTapasLayer(config, name=f"layer_._{i}") for i in range(config.num_hidden_layers)] + + def call( + self, + hidden_states: tf.Tensor, + attention_mask: tf.Tensor, + head_mask: tf.Tensor, + encoder_hidden_states: Optional[tf.Tensor], + encoder_attention_mask: Optional[tf.Tensor], + past_key_values: Optional[Tuple[Tuple[tf.Tensor]]], + use_cache: Optional[bool], + output_attentions: bool, + output_hidden_states: bool, + return_dict: bool, + training: bool = False, + ) -> Union[TFBaseModelOutputWithPastAndCrossAttentions, Tuple[tf.Tensor]]: + all_hidden_states = () if output_hidden_states else None + all_attentions = () if output_attentions else None + all_cross_attentions = () if output_attentions and self.config.add_cross_attention else None + + next_decoder_cache = () if use_cache else None + for i, layer_module in enumerate(self.layer): + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + past_key_value = past_key_values[i] if past_key_values is not None else None + + layer_outputs = layer_module( + hidden_states=hidden_states, + attention_mask=attention_mask, + head_mask=head_mask[i], + encoder_hidden_states=encoder_hidden_states, + encoder_attention_mask=encoder_attention_mask, + past_key_value=past_key_value, + output_attentions=output_attentions, + training=training, + ) + hidden_states = layer_outputs[0] + + if use_cache: + next_decoder_cache += (layer_outputs[-1],) + + if output_attentions: + all_attentions = all_attentions + (layer_outputs[1],) + if self.config.add_cross_attention and encoder_hidden_states is not None: + all_cross_attentions = all_cross_attentions + (layer_outputs[2],) + + # Add last layer + if output_hidden_states: + all_hidden_states = all_hidden_states + (hidden_states,) + + if not return_dict: + return tuple( + v for v in [hidden_states, all_hidden_states, all_attentions, all_cross_attentions] if v is not None + ) + + return TFBaseModelOutputWithPastAndCrossAttentions( + last_hidden_state=hidden_states, + past_key_values=next_decoder_cache, + hidden_states=all_hidden_states, + attentions=all_attentions, + cross_attentions=all_cross_attentions, + ) + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPooler with Bert->Tapas +class TFTapasPooler(tf.keras.layers.Layer): + def __init__(self, config: TapasConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + activation="tanh", + name="dense", + ) + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + # We "pool" the model by simply taking the hidden state corresponding + # to the first token. + first_token_tensor = hidden_states[:, 0] + pooled_output = self.dense(inputs=first_token_tensor) + + return pooled_output + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertPredictionHeadTransform with Bert->Tapas +class TFTapasPredictionHeadTransform(tf.keras.layers.Layer): + def __init__(self, config: TapasConfig, **kwargs): + super().__init__(**kwargs) + + self.dense = tf.keras.layers.Dense( + units=config.hidden_size, + kernel_initializer=get_initializer(config.initializer_range), + name="dense", + ) + + if isinstance(config.hidden_act, str): + self.transform_act_fn = get_tf_activation(config.hidden_act) + else: + self.transform_act_fn = config.hidden_act + + self.LayerNorm = tf.keras.layers.LayerNormalization(epsilon=config.layer_norm_eps, name="LayerNorm") + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.dense(inputs=hidden_states) + hidden_states = self.transform_act_fn(hidden_states) + hidden_states = self.LayerNorm(inputs=hidden_states) + + return hidden_states + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertLMPredictionHead with Bert->Tapas +class TFTapasLMPredictionHead(tf.keras.layers.Layer): + def __init__(self, config: TapasConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): + super().__init__(**kwargs) + + self.vocab_size = config.vocab_size + self.hidden_size = config.hidden_size + + self.transform = TFTapasPredictionHeadTransform(config, name="transform") + + # The output weights are the same as the input embeddings, but there is + # an output-only bias for each token. + self.input_embeddings = input_embeddings + + def build(self, input_shape: tf.TensorShape): + self.bias = self.add_weight(shape=(self.vocab_size,), initializer="zeros", trainable=True, name="bias") + + super().build(input_shape) + + def get_output_embeddings(self) -> tf.keras.layers.Layer: + return self.input_embeddings + + def set_output_embeddings(self, value: tf.Variable): + self.input_embeddings.weight = value + self.input_embeddings.vocab_size = shape_list(value)[0] + + def get_bias(self) -> Dict[str, tf.Variable]: + return {"bias": self.bias} + + def set_bias(self, value: tf.Variable): + self.bias = value["bias"] + self.vocab_size = shape_list(value["bias"])[0] + + def call(self, hidden_states: tf.Tensor) -> tf.Tensor: + hidden_states = self.transform(hidden_states=hidden_states) + seq_length = shape_list(hidden_states)[1] + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, self.hidden_size]) + hidden_states = tf.matmul(a=hidden_states, b=self.input_embeddings.weight, transpose_b=True) + hidden_states = tf.reshape(tensor=hidden_states, shape=[-1, seq_length, self.vocab_size]) + hidden_states = tf.nn.bias_add(value=hidden_states, bias=self.bias) + + return hidden_states + + +# Copied from transformers.models.bert.modeling_tf_bert.TFBertMLMHead with Bert->Tapas +class TFTapasMLMHead(tf.keras.layers.Layer): + def __init__(self, config: TapasConfig, input_embeddings: tf.keras.layers.Layer, **kwargs): + super().__init__(**kwargs) + + self.predictions = TFTapasLMPredictionHead(config, input_embeddings, name="predictions") + + def call(self, sequence_output: tf.Tensor) -> tf.Tensor: + prediction_scores = self.predictions(hidden_states=sequence_output) + + return prediction_scores + + +@keras_serializable +class TFTapasMainLayer(tf.keras.layers.Layer): + config_class = TapasConfig + + def __init__(self, config: TapasConfig, add_pooling_layer: bool = True, **kwargs): + requires_backends(self, "tensorflow_probability") + super().__init__(**kwargs) + + self.config = config + + self.embeddings = TFTapasEmbeddings(config, name="embeddings") + self.encoder = TFTapasEncoder(config, name="encoder") + self.pooler = TFTapasPooler(config, name="pooler") if add_pooling_layer else None + + def get_input_embeddings(self) -> tf.keras.layers.Layer: + return self.embeddings + + def set_input_embeddings(self, value: tf.Variable): + self.embeddings.weight = value + self.embeddings.vocab_size = shape_list(value)[0] + + def _prune_heads(self, heads_to_prune): + """ + Prunes heads of the model. heads_to_prune: dict of {layer_num: list of heads to prune in this layer} See base + class PreTrainedModel + """ + raise NotImplementedError + + def call( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: bool = False, + **kwargs, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + + if inputs["input_ids"] is not None and inputs["inputs_embeds"] is not None: + raise ValueError("You cannot specify both input_ids and inputs_embeds at the same time") + elif inputs["input_ids"] is not None: + input_shape = shape_list(inputs["input_ids"]) + elif inputs["inputs_embeds"] is not None: + input_shape = shape_list(inputs["inputs_embeds"])[:-1] + else: + raise ValueError("You have to specify either input_ids or inputs_embeds") + + if inputs["attention_mask"] is None: + inputs["attention_mask"] = tf.fill(dims=input_shape, value=1) + + if inputs["token_type_ids"] is None: + inputs["token_type_ids"] = tf.fill(dims=input_shape + [len(self.config.type_vocab_sizes)], value=0) + + embedding_output = self.embeddings( + input_ids=inputs["input_ids"], + position_ids=inputs["position_ids"], + token_type_ids=inputs["token_type_ids"], + inputs_embeds=inputs["inputs_embeds"], + training=inputs["training"], + ) + + # We create a 3D attention mask from a 2D tensor mask. + # Sizes are [batch_size, 1, 1, to_seq_length] + # So we can broadcast to [batch_size, num_heads, from_seq_length, to_seq_length] + # this attention mask is more simple than the triangular masking of causal attention + # used in OpenAI GPT, we just need to prepare the broadcast dimension here. + extended_attention_mask = tf.reshape(inputs["attention_mask"], (input_shape[0], 1, 1, input_shape[1])) + + # Since attention_mask is 1.0 for positions we want to attend and 0.0 for + # masked positions, this operation will create a tensor which is 0.0 for + # positions we want to attend and -10000.0 for masked positions. + # Since we are adding it to the raw scores before the softmax, this is + # effectively the same as removing these entirely. + extended_attention_mask = tf.cast(extended_attention_mask, dtype=embedding_output.dtype) + one_cst = tf.constant(1.0, dtype=embedding_output.dtype) + ten_thousand_cst = tf.constant(-10000.0, dtype=embedding_output.dtype) + extended_attention_mask = tf.multiply(tf.subtract(one_cst, extended_attention_mask), ten_thousand_cst) + + # Prepare head mask if needed + # 1.0 in head_mask indicate we keep the head + # attention_probs has shape bsz x n_heads x N x N + # input head_mask has shape [num_heads] or [num_hidden_layers x num_heads] + # and head_mask is converted to shape [num_hidden_layers x batch x num_heads x seq_length x seq_length] + if inputs["head_mask"] is not None: + raise NotImplementedError + else: + inputs["head_mask"] = [None] * self.config.num_hidden_layers + + encoder_outputs = self.encoder( + hidden_states=embedding_output, + attention_mask=extended_attention_mask, + head_mask=inputs["head_mask"], + encoder_hidden_states=None, + encoder_attention_mask=None, + past_key_values=None, + use_cache=None, + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + sequence_output = encoder_outputs[0] + pooled_output = self.pooler(hidden_states=sequence_output) if self.pooler is not None else None + + if not inputs["return_dict"]: + return ( + sequence_output, + pooled_output, + ) + encoder_outputs[1:] + + return TFBaseModelOutputWithPooling( + last_hidden_state=sequence_output, + pooler_output=pooled_output, + hidden_states=encoder_outputs.hidden_states, + attentions=encoder_outputs.attentions, + ) + + +class TFTapasPreTrainedModel(TFPreTrainedModel): + """ + An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained + models. + """ + + config_class = TapasConfig + base_model_prefix = "tapas" + + +TAPAS_START_DOCSTRING = r""" + + This model inherits from :class:`~transformers.TFPreTrainedModel`. Check the superclass documentation for the + generic methods the library implements for all its model (such as downloading or saving, resizing the input + embeddings, pruning heads etc.) + + This model is also a `tf.keras.Model `__ subclass. Use + it as a regular TF 2.0 Keras Model and refer to the TF 2.0 documentation for all matter related to general usage + and behavior. + + .. note:: + + TF 2.0 models accepts two formats as inputs: + + - having all inputs as keyword arguments (like PyTorch models), or + - having all inputs as a list, tuple or dict in the first positional arguments. + + This second option is useful when using :meth:`tf.keras.Model.fit` method which currently requires having all + the tensors in the first argument of the model call function: :obj:`model(inputs)`. + + If you choose this second option, there are three possibilities you can use to gather all the input Tensors in + the first positional argument : + + - a single Tensor with :obj:`input_ids` only and nothing else: :obj:`model(inputs_ids)` + - a list of varying length with one or several input Tensors IN THE ORDER given in the docstring: + :obj:`model([input_ids, attention_mask])` or :obj:`model([input_ids, attention_mask, token_type_ids])` + - a dictionary with one or several input Tensors associated to the input names given in the docstring: + :obj:`model({"input_ids": input_ids, "token_type_ids": token_type_ids})` + + Parameters: + config (:class:`~transformers.TapasConfig`): Model configuration class with all the parameters of the model. + Initializing with a config file does not load the weights associated with the model, only the + configuration. Check out the :meth:`~transformers.PreTrainedModel.from_pretrained` method to load the model + weights. +""" + +TAPAS_INPUTS_DOCSTRING = r""" + Args: + input_ids (:obj:`np.ndarray`, :obj:`tf.Tensor`, :obj:`List[tf.Tensor]` :obj:`Dict[str, tf.Tensor]` or :obj:`Dict[str, np.ndarray]` and each example must have the shape :obj:`({0})`): + Indices of input sequence tokens in the vocabulary. + + Indices can be obtained using :class:`~transformers.TapasTokenizer`. See + :func:`transformers.PreTrainedTokenizer.__call__` and :func:`transformers.PreTrainedTokenizer.encode` for + details. + + `What are input IDs? <../glossary.html#input-ids>`__ + attention_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Mask to avoid performing attention on padding token indices. Mask values selected in ``[0, 1]``: + + - 1 for tokens that are **not masked**, + - 0 for tokens that are **masked**. + + `What are attention masks? <../glossary.html#attention-mask>`__ + token_type_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, 7)`, `optional`): + Token indices that encode tabular structure. Indices can be obtained using + :class:`~transformers.TapasTokenizer`. See this class for more info. + + `What are token type IDs? <../glossary.html#token-type-ids>`__ + position_ids (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0})`, `optional`): + Indices of positions of each input sequence tokens in the position embeddings. If + ``reset_position_index_per_cell`` of :class:`~transformers.TapasConfig` is set to ``True``, relative + position embeddings will be used. Selected in the range ``[0, config.max_position_embeddings - 1]``. + + `What are position IDs? <../glossary.html#position-ids>`__ + head_mask (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`(num_heads,)` or :obj:`(num_layers, num_heads)`, `optional`): + Mask to nullify selected heads of the self-attention modules. Mask values selected in ``[0, 1]``: + + - 1 indicates the head is **not masked**, + - 0 indicates the head is **masked**. + + inputs_embeds (:obj:`np.ndarray` or :obj:`tf.Tensor` of shape :obj:`({0}, hidden_size)`, `optional`): + Optionally, instead of passing :obj:`input_ids` you can choose to directly pass an embedded representation. + This is useful if you want more control over how to convert :obj:`input_ids` indices into associated + vectors than the model's internal embedding lookup matrix. + output_attentions (:obj:`bool`, `optional`): + Whether or not to return the attentions tensors of all attention layers. See ``attentions`` under returned + tensors for more detail. This argument can be used only in eager mode, in graph mode the value in the + config will be used instead. + output_hidden_states (:obj:`bool`, `optional`): + Whether or not to return the hidden states of all layers. See ``hidden_states`` under returned tensors for + more detail. This argument can be used only in eager mode, in graph mode the value in the config will be + used instead. + return_dict (:obj:`bool`, `optional`): + Whether or not to return a :class:`~transformers.file_utils.ModelOutput` instead of a plain tuple. This + argument can be used in eager mode, in graph mode the value will always be set to True. + training (:obj:`bool`, `optional`, defaults to :obj:`False`): + Whether or not to use the model in training mode (some modules like dropout modules have different + behaviors between training and evaluation). +""" + + +@add_start_docstrings( + "The bare Tapas Model transformer outputting raw hidden-states without any specific head on top.", + TAPAS_START_DOCSTRING, +) +class TFTapasModel(TFTapasPreTrainedModel): + def __init__(self, config: TapasConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + self.tapas = TFTapasMainLayer(config, name="tapas") + + @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFBaseModelOutputWithPooling, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[TFBaseModelOutputWithPooling, Tuple[tf.Tensor]]: + r""" + Returns: + + Examples:: + + >>> from transformers import TapasTokenizer, TapasModel + >>> import pandas as pd + + >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base') + >>> model = TapasModel.from_pretrained('google/tapas-base') + + >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], + ... 'Age': ["56", "45", "59"], + ... 'Number of movies': ["87", "53", "69"] + ... } + >>> table = pd.DataFrame.from_dict(data) + >>> queries = ["How many movies has George Clooney played in?", "How old is Brad Pitt?"] + + >>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf") + >>> outputs = model(**inputs) + + >>> last_hidden_states = outputs.last_hidden_state + """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + training=training, + kwargs_call=kwargs, + ) + outputs = self.tapas( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + token_type_ids=inputs["token_type_ids"], + position_ids=inputs["position_ids"], + head_mask=inputs["head_mask"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + return outputs + + def serving_output(self, output: TFBaseModelOutputWithPooling) -> TFBaseModelOutputWithPooling: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFBaseModelOutputWithPooling( + last_hidden_state=output.last_hidden_state, + pooler_output=output.pooler_output, + hidden_states=hs, + attentions=attns, + ) + + +@add_start_docstrings("""Tapas Model with a `language modeling` head on top. """, TAPAS_START_DOCSTRING) +class TFTapasForMaskedLM(TFTapasPreTrainedModel, TFMaskedLanguageModelingLoss): + def __init__(self, config: TapasConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + if config.is_decoder: + logger.warning( + "If you want to use `TFTapasForMaskedLM` make sure `config.is_decoder=False` for " + "bi-directional self-attention." + ) + + self.tapas = TFTapasMainLayer(config, add_pooling_layer=False, name="tapas") + self.lm_head = TFTapasMLMHead(config, input_embeddings=self.tapas.embeddings, name="cls") + + def get_lm_head(self) -> tf.keras.layers.Layer: + return self.lm_head.predictions + + @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFMaskedLMOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[TFMaskedLMOutput, Tuple[tf.Tensor]]: + r""" + labels (:obj:`tf.Tensor` or :obj:`np.ndarray` of shape :obj:`(batch_size, sequence_length)`, `optional`): + Labels for computing the masked language modeling loss. Indices should be in ``[-100, 0, ..., + config.vocab_size]`` (see ``input_ids`` docstring) Tokens with indices set to ``-100`` are ignored + (masked), the loss is only computed for the tokens with labels in ``[0, ..., config.vocab_size]`` + + Returns: + + Examples:: + + >>> from transformers import TapasTokenizer, TapasForMaskedLM + >>> import pandas as pd + + >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base') + >>> model = TapasForMaskedLM.from_pretrained('google/tapas-base') + + >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], + ... 'Age': ["56", "45", "59"], + ... 'Number of movies': ["87", "53", "69"] + ... } + >>> table = pd.DataFrame.from_dict(data) + + >>> inputs = tokenizer(table=table, queries="How many [MASK] has George [MASK] played in?", return_tensors="tf") + >>> labels = tokenizer(table=table, queries="How many movies has George Clooney played in?", return_tensors="tf")["input_ids"] + + >>> outputs = model(**inputs, labels=labels) + >>> last_hidden_states = outputs.last_hidden_state + """ + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + training=training, + kwargs_call=kwargs, + ) + outputs = self.tapas( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + token_type_ids=inputs["token_type_ids"], + position_ids=inputs["position_ids"], + head_mask=inputs["head_mask"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + sequence_output = outputs[0] + prediction_scores = self.lm_head(sequence_output) + loss = ( + None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=prediction_scores) + ) + + if not inputs["return_dict"]: + output = (prediction_scores,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFMaskedLMOutput( + loss=loss, + logits=prediction_scores, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def serving_output(self, output: TFMaskedLMOutput) -> TFMaskedLMOutput: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFMaskedLMOutput(logits=output.logits, hidden_states=hs, attentions=attns) + + +class TFTapasComputeTokenLogits(tf.keras.layers.Layer): + def __init__(self, config: TapasConfig, **kwargs): + super().__init__(**kwargs) + + self.temperature = config.temperature + # cell selection heads + with tf.name_scope("output"): + self.output_weights = self.add_weight( + name="output_weights", + shape=(config.hidden_size,), + dtype=tf.float32, + trainable=True, + initializer=tf.zeros_initializer() + if config.init_cell_selection_weights_to_zero + else tf.keras.initializers.TruncatedNormal(stddev=config.initializer_range), + ) + self.output_bias = self.add_weight( + name="output_bias", shape=(), trainable=True, initializer=tf.zeros_initializer() + ) + + def call(self, sequence_output: tf.Tensor) -> tf.Tensor: + """ + Computes logits per token + + Args: + sequence_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Also known as last_hidden_state. Sequence of hidden-states at the output of the last layer of the + model. + + Returns: + logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): Logits per token. + """ + logits = (tf.einsum("bsj,j->bs", sequence_output, self.output_weights) + self.output_bias) / self.temperature + return logits + + +class TFTapasComputeColumnLogits(tf.keras.layers.Layer): + def __init__(self, config: TapasConfig, **kwargs): + super().__init__(**kwargs) + + with tf.name_scope("column_output"): + self.column_output_weights = self.add_weight( + name="column_output_weights", + shape=[config.hidden_size], + dtype=tf.float32, + trainable=True, + initializer=tf.zeros_initializer() + if config.init_cell_selection_weights_to_zero + else tf.keras.initializers.TruncatedNormal(stddev=config.initializer_range), + ) + self.column_output_bias = self.add_weight( + name="column_output_bias", shape=(), trainable=True, initializer=tf.zeros_initializer() + ) + + def call(self, sequence_output, cell_index, cell_mask, allow_empty_column_selection) -> tf.Tensor: + """ + Computes the column logits. + + Args: + sequence_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length, hidden_size)`): + Also known as last_hidden_state. Sequence of hidden-states at the output of the last layer of the + model. + cell_index (:obj:`ProductIndexMap`): + Index that groups tokens into cells. + cell_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, max_num_rows * max_num_cols)`): + Mask for cells that exist in the table (i.e. that are not padding). + allow_empty_column_selection (:obj:`bool`): + Whether to allow not to select any column + + Returns: + column_logits (:obj:`tf.Tensor`of shape :obj:`(batch_size, max_num_cols)`): Tensor containing the column + logits for every example in the batch. + """ + + # First, compute the token logits (batch_size, seq_len) - without temperature + token_logits = tf.einsum("bsj,j->bs", sequence_output, self.column_output_weights) + self.column_output_bias + + # Next, average the logits per cell (batch_size, max_num_cols*max_num_rows) + cell_logits, cell_logits_index = reduce_mean(token_logits, cell_index) + + # Finally, average the logits per column (batch_size, max_num_cols) + column_index = cell_index.project_inner(cell_logits_index) + column_logits, out_index = reduce_sum(cell_logits * cell_mask, column_index) + + cell_count, _ = reduce_sum(cell_mask, column_index) + column_logits /= cell_count + EPSILON_ZERO_DIVISION + + # Mask columns that do not appear in the example. + is_padding = tf.logical_and(cell_count < 0.5, tf.not_equal(out_index.indices, 0)) + column_logits += CLOSE_ENOUGH_TO_LOG_ZERO * tf.cast(is_padding, tf.float32) + + if not allow_empty_column_selection: + column_logits += CLOSE_ENOUGH_TO_LOG_ZERO * tf.cast(tf.equal(out_index.indices, 0), tf.float32) + + return column_logits + + +@add_start_docstrings( + """ + Tapas Model with a cell selection head and optional aggregation head on top for question-answering tasks on tables + (linear layers on top of the hidden-states output to compute `logits` and optional `logits_aggregation`), e.g. for + SQA, WTQ or WikiSQL-supervised tasks. + """, + TAPAS_START_DOCSTRING, +) +class TFTapasForQuestionAnswering(TFTapasPreTrainedModel): + def __init__(self, config: TapasConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + + # base model + self.tapas = TFTapasMainLayer(config, name="tapas") + + # dropout + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob) + + self.compute_token_logits = TFTapasComputeTokenLogits(config, name="compute_token_logits") + + self.compute_column_logits = TFTapasComputeColumnLogits(config, name="compute_column_logits") + + if config.num_aggregation_labels > 0: + self.aggregation_classifier = tf.keras.layers.Dense( + config.num_aggregation_labels, + kernel_initializer=get_initializer(config.initializer_range), + name="aggregation_classifier", + ) + self.config = config + + @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFTableQuestionAnsweringOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + table_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + aggregation_labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + float_answer: Optional[Union[np.ndarray, tf.Tensor]] = None, + numeric_values: Optional[Union[np.ndarray, tf.Tensor]] = None, + numeric_values_scale: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[TFTableQuestionAnsweringOutput, Tuple[tf.Tensor]]: + r""" + table_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`, `optional`): + Mask for the table. Indicates which tokens belong to the table (1). Question tokens, table headers and + padding are 0. + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`, `optional`): + Labels per token for computing the hierarchical cell selection loss. This encodes the positions of the + answer appearing in the table. Can be obtained using :class:`~transformers.TapasTokenizer`. + + - 1 for tokens that are **part of the answer**, + - 0 for tokens that are **not part of the answer**. + + aggregation_labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`, `optional`): + Aggregation function index for every example in the batch for computing the aggregation loss. Indices + should be in :obj:`[0, ..., config.num_aggregation_labels - 1]`. Only required in case of strong + supervision for aggregation (WikiSQL-supervised). + float_answer (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`, `optional`): + Float answer for every example in the batch. Set to `float('nan')` for cell selection questions. Only + required in case of weak supervision (WTQ) to calculate the aggregate mask and regression loss. + numeric_values (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`, `optional`): + Numeric values of every token, NaN for tokens which are not numeric values. Can be obtained using + :class:`~transformers.TapasTokenizer`. Only required in case of weak supervision for aggregation (WTQ) to + calculate the regression loss. + numeric_values_scale (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`, `optional`): + Scale of the numeric values of every token. Can be obtained using :class:`~transformers.TapasTokenizer`. + Only required in case of weak supervision for aggregation (WTQ) to calculate the regression loss. + + Returns: + + Examples:: + + >>> from transformers import TapasTokenizer, TapasForQuestionAnswering + >>> import pandas as pd + + >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base-finetuned-wtq') + >>> model = TapasForQuestionAnswering.from_pretrained('google/tapas-base-finetuned-wtq') + + >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], + ... 'Age': ["56", "45", "59"], + ... 'Number of movies': ["87", "53", "69"] + ... } + >>> table = pd.DataFrame.from_dict(data) + >>> queries = ["How many movies has George Clooney played in?", "How old is Brad Pitt?"] + + >>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf") + >>> outputs = model(**inputs) + + >>> logits = outputs.logits + >>> logits_aggregation = outputs.logits_aggregation + """ + + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + table_mask=table_mask, + aggregation_labels=aggregation_labels, + float_answer=float_answer, + numeric_values=numeric_values, + numeric_values_scale=numeric_values_scale, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + training=training, + kwargs_call=kwargs, + ) + outputs = self.tapas( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + token_type_ids=inputs["token_type_ids"], + position_ids=inputs["position_ids"], + head_mask=inputs["head_mask"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + + sequence_output = outputs[0] + pooled_output = outputs[1] + + sequence_output = self.dropout(sequence_output) + + if inputs["input_ids"] is not None: + input_shape = shape_list(inputs["input_ids"]) + else: + input_shape = shape_list(inputs["inputs_embeds"])[:-1] + + # Construct indices for the table. + if inputs["token_type_ids"] is None: + inputs["token_type_ids"] = tf.fill(input_shape + [len(self.config.type_vocab_sizes)], 0) + + token_types = [ + "segment_ids", + "column_ids", + "row_ids", + "prev_labels", + "column_ranks", + "inv_column_ranks", + "numeric_relations", + ] + + row_ids = inputs["token_type_ids"][:, :, token_types.index("row_ids")] + column_ids = inputs["token_type_ids"][:, :, token_types.index("column_ids")] + + # Construct indices for the table. + row_index = IndexMap( + indices=tf.minimum(tf.cast(row_ids, tf.int32), self.config.max_num_rows - 1), + num_segments=self.config.max_num_rows, + batch_dims=1, + ) + col_index = IndexMap( + indices=tf.minimum(tf.cast(column_ids, tf.int32), self.config.max_num_columns - 1), + num_segments=self.config.max_num_columns, + batch_dims=1, + ) + cell_index = ProductIndexMap(row_index, col_index) + + # Masks. + input_shape = ( + shape_list(inputs["input_ids"]) + if inputs["input_ids"] is not None + else shape_list(inputs["inputs_embeds"])[:-1] + ) + if inputs["attention_mask"] is None: + inputs["attention_mask"] = tf.ones(input_shape) + # Table cells only, without question tokens and table headers. + if inputs["table_mask"] is None: + inputs["table_mask"] = tf.where(row_ids > 0, tf.ones_like(row_ids), tf.zeros_like(row_ids)) + # [batch_size, seq_length] + input_mask_float = tf.cast(inputs["attention_mask"], tf.float32) + table_mask_float = tf.cast(inputs["table_mask"], tf.float32) + + # Mask for cells that exist in the table (i.e. that are not padding). + cell_mask, _ = reduce_mean(input_mask_float, cell_index) + + # Compute logits per token. These are used to select individual cells. + logits = self.compute_token_logits(sequence_output) + + # Compute logits per column. These are used to select a column. + column_logits = None + if self.config.select_one_column: + column_logits = self.compute_column_logits( + sequence_output, cell_index, cell_mask, self.config.allow_empty_column_selection + ) + + # Aggregate logits. + logits_aggregation = None + if self.config.num_aggregation_labels > 0: + logits_aggregation = self.aggregation_classifier(pooled_output) + + # Total loss calculation + total_loss = 0.0 + calculate_loss = False + if inputs["labels"] is not None: + calculate_loss = True + is_supervised = not self.config.num_aggregation_labels > 0 or not self.config.use_answer_as_supervision + + # Semi-supervised cell selection in case of no aggregation: + # If the answer (the denotation) appears directly in the table we might + # select the answer without applying any aggregation function. There are + # some ambiguous cases, see utils._calculate_aggregate_mask for more info. + # `aggregate_mask` is 1 for examples where we chose to aggregate and 0 + # for examples where we chose to select the answer directly. + # `labels` encodes the positions of the answer appearing in the table. + if is_supervised: + aggregate_mask = None + else: + if inputs["float_answer"] is not None: + assert ( + shape_list(inputs["labels"])[0] == shape_list(inputs["float_answer"])[0] + ), "Make sure the answers are a FloatTensor of shape (batch_size,)" + # [batch_size] + aggregate_mask = _calculate_aggregate_mask( + inputs["float_answer"], + pooled_output, + self.config.cell_selection_preference, + inputs["labels"], + self.aggregation_classifier, + ) + else: + aggregate_mask = None + raise ValueError("You have to specify float answers in order to calculate the aggregate mask") + + # Cell selection log-likelihood + if self.config.average_logits_per_cell: + logits_per_cell, _ = reduce_mean(logits, cell_index) + logits = gather(logits_per_cell, cell_index) + dist_per_token = tfp.distributions.Bernoulli(logits=logits) + + # Compute cell selection loss per example. + selection_loss_per_example = None + if not self.config.select_one_column: + weight = tf.where( + inputs["labels"] == 0, + tf.ones_like(inputs["labels"], dtype=tf.float32), + self.config.positive_label_weight * tf.ones_like(inputs["labels"], dtype=tf.float32), + ) + selection_loss_per_token = -dist_per_token.log_prob(inputs["labels"]) * weight + selection_loss_per_example = tf.reduce_sum(selection_loss_per_token * input_mask_float, axis=1) / ( + tf.reduce_sum(input_mask_float, axis=1) + EPSILON_ZERO_DIVISION + ) + else: + selection_loss_per_example, logits = _single_column_cell_selection_loss( + logits, column_logits, inputs["labels"], cell_index, col_index, cell_mask + ) + dist_per_token = tfp.distributions.Bernoulli(logits=logits) + + # Supervised cell selection + if self.config.disable_per_token_loss: + pass + elif is_supervised: + total_loss += tf.reduce_mean(selection_loss_per_example) + else: + # For the not supervised case, do not assign loss for cell selection + total_loss += tf.reduce_mean(selection_loss_per_example * (1.0 - aggregate_mask)) + + # Semi-supervised regression loss and supervised loss for aggregations + if self.config.num_aggregation_labels > 0: + if is_supervised: + # Note that `aggregate_mask` is None if the setting is supervised. + if inputs["aggregation_labels"] is not None: + assert ( + shape_list(inputs["labels"])[0] == shape_list(inputs["aggregation_labels"])[0] + ), "Make sure the aggregation labels are a LongTensor of shape (batch_size,)" + per_example_additional_loss = _calculate_aggregation_loss( + logits_aggregation, + aggregate_mask, + inputs["aggregation_labels"], + self.config.use_answer_as_supervision, + self.config.num_aggregation_labels, + self.config.aggregation_loss_weight, + ) + else: + raise ValueError( + "You have to specify aggregation labels in order to calculate the aggregation loss" + ) + else: + aggregation_labels = tf.zeros(shape_list(inputs["labels"])[0], dtype=tf.int32) + per_example_additional_loss = _calculate_aggregation_loss( + logits_aggregation, + aggregate_mask, + aggregation_labels, + self.config.use_answer_as_supervision, + self.config.num_aggregation_labels, + self.config.aggregation_loss_weight, + ) + + if self.config.use_answer_as_supervision: + if inputs["numeric_values"] is not None and inputs["numeric_values_scale"] is not None: + assert shape_list(inputs["numeric_values"]) == shape_list(inputs["numeric_values_scale"]) + # Add regression loss for numeric answers which require aggregation. + answer_loss, large_answer_loss_mask = _calculate_regression_loss( + inputs["float_answer"], + aggregate_mask, + dist_per_token, + inputs["numeric_values"], + inputs["numeric_values_scale"], + table_mask_float, + logits_aggregation, + self.config, + ) + per_example_additional_loss += answer_loss + # Zero loss for examples with answer_loss > cutoff. + per_example_additional_loss *= large_answer_loss_mask + else: + raise ValueError( + "You have to specify numeric values and numeric values scale in order to calculate the regression loss" + ) + total_loss += tf.reduce_mean(per_example_additional_loss) + + else: + # if no label ids are provided, set them to zeros in order to properly compute logits + labels = tf.zeros_like(logits) + _, logits = _single_column_cell_selection_loss( + logits, column_logits, labels, cell_index, col_index, cell_mask + ) + if not inputs["return_dict"]: + output = (logits, logits_aggregation) + outputs[2:] + return ((total_loss,) + output) if calculate_loss else output + + return TFTableQuestionAnsweringOutput( + loss=total_loss if calculate_loss else None, + logits=logits, + logits_aggregation=logits_aggregation, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def serving_output(self, output: TFTableQuestionAnsweringOutput) -> TFTableQuestionAnsweringOutput: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFTableQuestionAnsweringOutput( + logits=output.logits, logits_aggregation=output.logits_aggregation, hidden_states=hs, attentions=attns + ) + + +@add_start_docstrings( + """ + Tapas Model with a sequence classification head on top (a linear layer on top of the pooled output), e.g. for table + entailment tasks, such as TabFact (Chen et al., 2020). + """, + TAPAS_START_DOCSTRING, +) +class TFTapasForSequenceClassification(TFTapasPreTrainedModel, TFSequenceClassificationLoss): + def __init__(self, config: TapasConfig, *inputs, **kwargs): + super().__init__(config, *inputs, **kwargs) + self.num_labels = config.num_labels + + self.tapas = TFTapasMainLayer(config, name="tapas") + self.dropout = tf.keras.layers.Dropout(config.hidden_dropout_prob, name="dropout") + self.classifier = tf.keras.layers.Dense( + config.num_labels, kernel_initializer=get_initializer(config.initializer_range), name="classifier" + ) + + @add_start_docstrings_to_model_forward(TAPAS_INPUTS_DOCSTRING.format("batch_size, num_choices, sequence_length")) + @add_code_sample_docstrings( + processor_class=_TOKENIZER_FOR_DOC, + checkpoint=_CHECKPOINT_FOR_DOC, + output_type=TFSequenceClassifierOutput, + config_class=_CONFIG_FOR_DOC, + ) + def call( + self, + input_ids: Optional[TFModelInputType] = None, + attention_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + token_type_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + position_ids: Optional[Union[np.ndarray, tf.Tensor]] = None, + head_mask: Optional[Union[np.ndarray, tf.Tensor]] = None, + inputs_embeds: Optional[Union[np.ndarray, tf.Tensor]] = None, + output_attentions: Optional[bool] = None, + output_hidden_states: Optional[bool] = None, + return_dict: Optional[bool] = None, + labels: Optional[Union[np.ndarray, tf.Tensor]] = None, + training: Optional[bool] = False, + **kwargs, + ) -> Union[TFSequenceClassifierOutput, Tuple[tf.Tensor]]: + r""" + labels (:obj:`torch.LongTensor` of shape :obj:`(batch_size,)`, `optional`): + Labels for computing the sequence classification/regression loss. Indices should be in :obj:`[0, ..., + config.num_labels - 1]`. If :obj:`config.num_labels == 1` a regression loss is computed (Mean-Square loss), + If :obj:`config.num_labels > 1` a classification loss is computed (Cross-Entropy). Note: this is called + "classification_class_index" in the original implementation. + + Returns: + + Examples:: + + >>> from transformers import TapasTokenizer, TapasForSequenceClassification + >>> import tensorflow as tf + >>> import pandas as pd + + >>> tokenizer = TapasTokenizer.from_pretrained('google/tapas-base-finetuned-tabfact') + >>> model = TapasForSequenceClassification.from_pretrained('google/tapas-base-finetuned-tabfact') + + >>> data = {'Actors': ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], + ... 'Age': ["56", "45", "59"], + ... 'Number of movies': ["87", "53", "69"] + ... } + >>> table = pd.DataFrame.from_dict(data) + >>> queries = ["There is only one actor who is 45 years old", "There are 3 actors which played in more than 60 movies"] + + >>> inputs = tokenizer(table=table, queries=queries, padding="max_length", return_tensors="tf") + >>> labels = tf.convert_to_tensor([1, 0]) # 1 means entailed, 0 means refuted + + >>> outputs = model(**inputs, labels=labels) + >>> loss = outputs.loss + >>> logits = outputs.logits + """ + + inputs = input_processing( + func=self.call, + config=self.config, + input_ids=input_ids, + attention_mask=attention_mask, + token_type_ids=token_type_ids, + position_ids=position_ids, + head_mask=head_mask, + inputs_embeds=inputs_embeds, + output_attentions=output_attentions, + output_hidden_states=output_hidden_states, + return_dict=return_dict, + labels=labels, + training=training, + kwargs_call=kwargs, + ) + outputs = self.tapas( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + token_type_ids=inputs["token_type_ids"], + position_ids=inputs["position_ids"], + head_mask=inputs["head_mask"], + inputs_embeds=inputs["inputs_embeds"], + output_attentions=inputs["output_attentions"], + output_hidden_states=inputs["output_hidden_states"], + return_dict=inputs["return_dict"], + training=inputs["training"], + ) + pooled_output = outputs[1] + pooled_output = self.dropout(inputs=pooled_output, training=inputs["training"]) + logits = self.classifier(inputs=pooled_output) + loss = None if inputs["labels"] is None else self.compute_loss(labels=inputs["labels"], logits=logits) + + if not inputs["return_dict"]: + output = (logits,) + outputs[2:] + return ((loss,) + output) if loss is not None else output + + return TFSequenceClassifierOutput( + loss=loss, + logits=logits, + hidden_states=outputs.hidden_states, + attentions=outputs.attentions, + ) + + def serving_output(self, output: TFSequenceClassifierOutput) -> TFSequenceClassifierOutput: + hs = tf.convert_to_tensor(output.hidden_states) if self.config.output_hidden_states else None + attns = tf.convert_to_tensor(output.attentions) if self.config.output_attentions else None + + return TFSequenceClassifierOutput(logits=output.logits, hidden_states=hs, attentions=attns) + + +""" TAPAS utilities.""" + + +class AverageApproximationFunction(str, enum.Enum): + RATIO = "ratio" + FIRST_ORDER = "first_order" + SECOND_ORDER = "second_order" + + +# Beginning of everything related to segmented tensors + + +class IndexMap(object): + """Index grouping entries within a tensor.""" + + def __init__(self, indices, num_segments, batch_dims=0): + """ + Creates an index. + + Args: + indices: Tensor of indices, same shape as `values`. + num_segments: Scalar tensor, the number of segments. All elements + in a batched segmented tensor must have the same number of segments (although many segments can be empty). + batch_dims: Python integer, the number of batch dimensions. The first + `batch_dims` dimensions of a SegmentedTensor are treated as batch dimensions. Segments in different batch + elements are always distinct even if they have the same index. + """ + self.indices = tf.convert_to_tensor(indices) + self.num_segments = tf.convert_to_tensor(num_segments) + self.batch_dims = batch_dims + + def batch_shape(self): + return tf.shape(self.indices)[: self.batch_dims] + + +class ProductIndexMap(IndexMap): + """The product of two indices.""" + + def __init__(self, outer_index, inner_index): + """ + Combines indices i and j into pairs (i, j). The result is an index where each segment (i, j) is the + intersection of segments i and j. For example if the inputs represent table cells indexed by respectively rows + and columns the output will be a table indexed by (row, column) pairs, i.e. by cell. The implementation + combines indices {0, .., n - 1} and {0, .., m - 1} into {0, .., nm - 1}. The output has `num_segments` equal to + `outer_index.num_segements` * `inner_index.num_segments`. + + Args: + outer_index: IndexMap. + inner_index: IndexMap, must have the same shape as `outer_index`. + """ + if outer_index.batch_dims != inner_index.batch_dims: + raise ValueError("outer_index.batch_dims and inner_index.batch_dims " "must be the same.") + + super(ProductIndexMap, self).__init__( + indices=(inner_index.indices + outer_index.indices * inner_index.num_segments), + num_segments=inner_index.num_segments * outer_index.num_segments, + batch_dims=inner_index.batch_dims, + ) + self.outer_index = outer_index + self.inner_index = inner_index + + def project_outer(self, index): + """Projects an index with the same index set onto the outer components.""" + return IndexMap( + indices=tf.math.floordiv(index.indices, self.inner_index.num_segments), + num_segments=self.outer_index.num_segments, + batch_dims=index.batch_dims, + ) + + def project_inner(self, index): + """Projects an index with the same index set onto the inner components.""" + return IndexMap( + indices=tf.math.floormod(index.indices, self.inner_index.num_segments), + num_segments=self.inner_index.num_segments, + batch_dims=index.batch_dims, + ) + + +def gather(values, index, name="segmented_gather"): + """ + Gathers from `values` using the index map. For each element in the domain of the index map this operation looks up + a value for that index in `values`. Two elements from the same segment always get assigned the same value. + + Args: + values: [B1, ..., Bn, num_segments, V1, ...] Tensor with segment values. + index: [B1, ..., Bn, I1, ..., Ik] IndexMap. + name: Name for the TensorFlow operation. + + Returns: + [B1, ..., Bn, I1, ..., Ik, V1, ...] Tensor with the gathered values. + """ + return tf.gather(values, index.indices, batch_dims=index.batch_dims, name=name) + + +def flatten(index, name="segmented_flatten"): + """ + Flattens a batched index map to a 1d index map. This operation relabels the segments to keep batch elements + distinct. The k-th batch element will have indices shifted by `num_segments` * (k - 1). The result is a tensor with + `num_segments` multiplied by the number of elements in the batch. + + Args: + index: IndexMap to flatten. + name: Name for the TensorFlow operation. + + Returns: + The flattened IndexMap. + """ + batch_size = tf.reduce_prod(index.batch_shape()) + offset = tf.range(batch_size) * index.num_segments + offset = tf.reshape(offset, index.batch_shape()) + for _ in range(index.batch_dims, index.indices.shape.rank): + offset = tf.expand_dims(offset, -1) + + indices = offset + index.indices + return IndexMap(indices=tf.reshape(indices, [-1]), num_segments=index.num_segments * batch_size, batch_dims=0) + + +def range_index_map(batch_shape, num_segments, name="range_index_map"): + """ + Constructs an index map equal to range(num_segments). + + Args: + batch_shape (:obj:`tf.Tensor`): + Batch shape + num_segments (:obj:`int`): + Number of segments + name (:obj:`str`, `optional`, defaults to 'range_index_map'): + Name for the operation. Currently not used + + Returns: + (:obj:`IndexMap`): IndexMap of shape batch_shape with elements equal to range(num_segments). + """ + batch_shape = tf.convert_to_tensor(batch_shape) + batch_shape.shape.assert_has_rank(1) + num_segments = tf.convert_to_tensor(num_segments) + num_segments.shape.assert_has_rank(0) + + indices = tf.range(num_segments) + shape = tf.concat([tf.ones_like(batch_shape, dtype=tf.int32), tf.expand_dims(num_segments, axis=0)], axis=0) + indices = tf.reshape(indices, shape) + multiples = tf.concat([batch_shape, [1]], axis=0) + indices = tf.tile(indices, multiples) + return IndexMap(indices=indices, num_segments=num_segments, batch_dims=batch_shape.shape.as_list()[0]) + + +def _segment_reduce(values, index, segment_reduce_fn, name): + """ + Applies a segment reduction segment-wise. + + Args: + values (:obj:`tf.Tensor`): + Tensor with segment values. + index (:obj:`IndexMap`): + IndexMap. + segment_reduce_fn (:obj:`str`): + Name for the reduce operation. One of "sum", "mean", "max" or "min". + name (:obj:`str`): + Name for the operation. Currently not used + + Returns: + (:obj:`IndexMap`): IndexMap of shape batch_shape with elements equal to range(num_segments). + """ + # Flatten the batch dimensions, as segments ops do not support batching. + # However if `values` has extra dimensions to the right keep them + # unflattened. Segmented ops support vector-valued operations. + flat_index = flatten(index) + vector_shape = tf.shape(values)[index.indices.shape.rank :] + flattened_shape = tf.concat([[-1], vector_shape], axis=0) + flat_values = tf.reshape(values, flattened_shape) + segment_means = segment_reduce_fn( + data=flat_values, segment_ids=flat_index.indices, num_segments=flat_index.num_segments + ) + + # Unflatten the values. + new_shape = tf.concat([index.batch_shape(), [index.num_segments], vector_shape], axis=0) + output_values = tf.reshape(segment_means, new_shape) + output_index = range_index_map(index.batch_shape(), index.num_segments) + return output_values, output_index + + +def reduce_mean(values, index, name="segmented_reduce_mean"): + """ + Averages a tensor over its segments. Outputs 0 for empty segments. This operations computes the mean over segments, + with support for: + + - Batching using the first dimensions [B1, B2, ..., Bn]. Each element in a batch can have different indices. + - Vectorization using the last dimension [V1, V2, ...]. If they are present the output will be a mean of vectors + rather than scalars. + Only the middle dimensions [I1, ..., Ik] are reduced by the operation. + + Args: + values: [B1, B2, ..., Bn, I1, .., Ik, V1, V2, ..] tensor of values to be + averaged. + index: IndexMap [B1, B2, ..., Bn, I1, .., Ik] index defining the segments. + name: Name for the TensorFlow ops. + + Returns: + A pair (output_values, output_index) where `output_values` is a tensor of shape [B1, B2, ..., Bn, num_segments, + V1, V2, ..] and `index` is an IndexMap with shape [B1, B2, ..., Bn, num_segments]. + """ + return _segment_reduce(values, index, tf.math.unsorted_segment_mean, name) + + +def reduce_sum(values, index, name="segmented_reduce_sum"): + """ + Sums a tensor over its segments. Outputs 0 for empty segments. This operations computes the sum over segments, with + support for: + + - Batching using the first dimensions [B1, B2, ..., Bn]. Each element in a batch can have different indices. + - Vectorization using the last dimension [V1, V2, ...]. If they are present the output will be a sum of vectors + rather than scalars. + Only the middle dimensions [I1, ..., Ik] are reduced by the operation. + + Args: + values: [B1, B2, ..., Bn, I1, .., Ik, V1, V2, ..] tensor of values to be + averaged. + index: IndexMap [B1, B2, ..., Bn, I1, .., Ik] index defining the segments. + name: Name for the TensorFlow ops. + + Returns: + A pair (output_values, output_index) where `output_values` is a tensor of shape [B1, B2, ..., Bn, num_segments, + V1, V2, ..] and `index` is an IndexMap with shape [B1, B2, ..., Bn, num_segments]. + """ + return _segment_reduce(values, index, tf.math.unsorted_segment_sum, name) + + +def reduce_max(values, index, name="segmented_reduce_max"): + """ + Computes the maximum over segments. This operations computes the maximum over segments, with support for: + + - Batching using the first dimensions [B1, B2, ..., Bn]. Each element in a batch can have different indices. + - Vectorization using the last dimension [V1, V2, ...]. If they are present the output will be an element-wise + maximum of vectors rather than scalars. + Only the middle dimensions [I1, ..., Ik] are reduced by the operation. + + Args: + values: [B1, B2, ..., Bn, I1, .., Ik, V1, V2, ..] tensor of values to be + averaged. + index: IndexMap [B1, B2, ..., Bn, I1, .., Ik] index defining the segments. + name: Name for the TensorFlow ops. + + Returns: + A pair (output_values, output_index) where `output_values` is a tensor of shape [B1, B2, ..., Bn, num_segments, + V1, V2, ..] and `index` is an IndexMap with shape [B1, B2, ..., Bn, num_segments]. + """ + return _segment_reduce(values, index, tf.math.unsorted_segment_max, name) + + +def reduce_min(values, index, name="segmented_reduce_min"): + """Computes the minimum over segments.""" + return _segment_reduce(values, index, tf.math.unsorted_segment_min, name) + + +def _single_column_cell_selection_loss(token_logits, column_logits, labels, cell_index, col_index, cell_mask): + """ + Computes the loss for cell selection constrained to a single column. The loss is a hierarchical log-likelihood. The + model first predicts a column and then selects cells within that column (conditioned on the column). Cells outside + the selected column are never selected. + + Args: + token_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): + Tensor containing the logits per token. + column_logits (:obj:`tf.Tensor` of shape :obj:`(batch_size, max_num_cols)`): + Tensor containing the logits per column. + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): + Labels per token. + cell_index (:obj:`ProductIndexMap`): + Index that groups tokens into cells. + col_index (:obj:`IndexMap`): + Index that groups tokens into columns. + cell_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, max_num_rows * max_num_cols)`): + Mask for cells that exist in the table (i.e. that are not padding). + + Returns: + selection_loss_per_example (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): Loss for each example. logits + (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): New logits which are only allowed to select + cells in a single column. Logits outside of the most likely column according to `column_logits` will be set to + a very low value (such that the probabilities are 0). + """ + # First find the column we should select. We use the column with maximum + # number of selected cells. + labels_per_column, _ = reduce_sum(tf.cast(labels, tf.float32), col_index) + column_label = tf.argmax(labels_per_column, axis=-1, output_type=tf.int32) + # Check if there are no selected cells in the column. In that case the model + # should predict the special column id 0, which means "select nothing". + no_cell_selected = tf.equal(tf.reduce_max(labels_per_column, axis=-1), 0) + column_label = tf.where(no_cell_selected, tf.zeros_like(column_label), column_label) + + column_dist = tfp.distributions.Categorical(logits=column_logits) + column_loss_per_example = -column_dist.log_prob(column_label) + + # Reduce the labels and logits to per-cell from per-token. + logits_per_cell, _ = reduce_mean(token_logits, cell_index) + labels_per_cell, labels_index = reduce_max(tf.cast(labels, tf.int32), cell_index) + + # Mask for the selected column. + column_id_for_cells = cell_index.project_inner(labels_index).indices + column_mask = tf.cast(tf.equal(column_id_for_cells, tf.expand_dims(column_label, axis=1)), tf.float32) + + # Compute the log-likelihood for cells, but only for the selected column. + cell_dist = tfp.distributions.Bernoulli(logits=logits_per_cell) + cell_log_prob = cell_dist.log_prob(labels_per_cell) + cell_loss = -tf.reduce_sum(cell_log_prob * column_mask * cell_mask, axis=1) + # We need to normalize the loss by the number of cells in the column. + cell_loss /= tf.reduce_sum(column_mask * cell_mask, axis=1) + EPSILON_ZERO_DIVISION + + selection_loss_per_example = column_loss_per_example + selection_loss_per_example += tf.where(no_cell_selected, tf.zeros_like(selection_loss_per_example), cell_loss) + + # Set the probs outside the selected column (selected by the *model*) + # to 0. This ensures backwards compatibility with models that select + # cells from multiple columns. + selected_column_id = tf.argmax(column_logits, axis=-1, output_type=tf.int32) + selected_column_mask = tf.cast( + tf.equal(column_id_for_cells, tf.expand_dims(selected_column_id, axis=-1)), tf.float32 + ) + # Never select cells with the special column id 0. + selected_column_mask = tf.where( + tf.equal(column_id_for_cells, 0), tf.zeros_like(selected_column_mask), selected_column_mask + ) + logits_per_cell += CLOSE_ENOUGH_TO_LOG_ZERO * (1.0 - cell_mask * selected_column_mask) + logits = gather(logits_per_cell, cell_index) + + return selection_loss_per_example, logits + + +def _calculate_aggregate_mask(answer, pooled_output, cell_selection_preference, labels, aggregation_classifier): + """ + Finds examples where the model should select cells with no aggregation. + + Returns a mask that determines for which examples should the model select answers directly from the table, without + any aggregation function. If the answer is a piece of text the case is unambiguous as aggregation functions only + apply to numbers. If the answer is a number but does not appear in the table then we must use some aggregation + case. The ambiguous case is when the answer is a number that also appears in the table. In this case we use the + aggregation function probabilities predicted by the model to decide whether to select or aggregate. The threshold + for this is a hyperparameter `cell_selection_preference` + + Args: + answer (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`): + Answer for every example in the batch. Nan if there is no scalar answer. + pooled_output (:obj:`tf.Tensor` of shape :obj:`(batch_size, hidden_size)`): + Output of the pooler (BertPooler) on top of the encoder layer. + cell_selection_preference (:obj:`float`): + Preference for cell selection in ambiguous cases. + labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, sequence_length)`): + Labels per token. aggregation_classifier (:obj:`torch.nn.Linear`): Aggregation head + + Returns: + aggregate_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): A mask set to 1 for examples that should use + aggregation functions. + """ + # tf.Tensor(batch_size,) + aggregate_mask_init = tf.cast(tf.logical_not(tf.math.is_nan(answer)), tf.float32) + logits_aggregation = aggregation_classifier(pooled_output) + dist_aggregation = tfp.distributions.Categorical(logits=logits_aggregation) + # Index 0 corresponds to "no aggregation". + aggregation_ops_total_mass = tf.reduce_sum(dist_aggregation.probs_parameter()[:, 1:], axis=1) + # Cell selection examples according to current model. + is_pred_cell_selection = aggregation_ops_total_mass <= cell_selection_preference + # Examples with non-empty cell selection supervision. + is_cell_supervision_available = tf.reduce_sum(labels, axis=1) > 0 + aggregate_mask = tf.where( + tf.logical_and(is_pred_cell_selection, is_cell_supervision_available), + tf.zeros_like(aggregate_mask_init, dtype=tf.float32), + aggregate_mask_init, + ) + aggregate_mask = tf.stop_gradient(aggregate_mask) + return aggregate_mask + + +def _calculate_aggregation_loss_known( + logits_aggregation, aggregate_mask, aggregation_labels, use_answer_as_supervision, num_aggregation_labels +): + """ + Calculates aggregation loss when its type is known during training. + + In the weakly supervised setting, the only known information is that for cell selection examples, "no aggregation" + should be predicted. For other examples (those that require aggregation), no loss is accumulated. In the setting + where aggregation type is always known, standard cross entropy loss is accumulated for all examples + + Args: + logits_aggregation (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_aggregation_labels)`): + Logits per aggregation operation. + aggregate_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`): + A mask set to 1 for examples that should use aggregation functions. + aggregation_labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`): + Aggregation function id for every example in the batch. + use_answer_as_supervision (:obj:`bool`, `optional`): + Whether to use the answer as the only supervision for aggregation examples. + num_aggregation_labels (:obj:`int`, `optional`, defaults to 0): + The number of aggregation operators to predict. + + Returns: + aggregation_loss_known (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): Aggregation loss (when its type is + known during training) per example. + """ + if use_answer_as_supervision: + # Prepare "no aggregation" targets for cell selection examples. + target_aggregation = tf.zeros_like(aggregate_mask, dtype=tf.int32) + else: + # Use aggregation supervision as the target. + target_aggregation = aggregation_labels + + one_hot_labels = tf.one_hot(target_aggregation, depth=num_aggregation_labels, dtype=tf.float32) + log_probs = tf.nn.log_softmax(logits_aggregation, axis=-1) + + # [batch_size] + per_example_aggregation_intermediate = -tf.reduce_sum(one_hot_labels * log_probs, axis=-1) + if use_answer_as_supervision: + # Accumulate loss only for examples requiring cell selection + # (no aggregation). + return per_example_aggregation_intermediate * (1 - aggregate_mask) + else: + return per_example_aggregation_intermediate + + +def _calculate_aggregation_loss_unknown(logits_aggregation, aggregate_mask): + """ + Calculates aggregation loss in the case of answer supervision. + + Args: + logits_aggregation (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_aggregation_labels)`): + Logits per aggregation operation. + aggregate_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`): + A mask set to 1 for examples that should use aggregation functions + + Returns: + aggregation_loss_unknown (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): Aggregation loss (in case of answer + supervision) per example. + """ + dist_aggregation = tfp.distributions.Categorical(logits=logits_aggregation) + # Index 0 corresponds to "no aggregation". + aggregation_ops_total_mass = tf.reduce_sum(dist_aggregation.probs_parameter()[:, 1:], axis=1) + # Predict some aggregation in case of an answer that needs aggregation. + # This increases the probability of all aggregation functions, in a way + # similar to MML, but without considering whether the function gives the + # correct answer. + return -tf.math.log(aggregation_ops_total_mass) * aggregate_mask + + +def _calculate_aggregation_loss( + logits_aggregation, + aggregate_mask, + aggregation_labels, + use_answer_as_supervision, + num_aggregation_labels, + aggregation_loss_weight, +): + """ + Calculates the aggregation loss per example. + + Args: + logits_aggregation (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_aggregation_labels)`): + Logits per aggregation operation. + aggregate_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`): + A mask set to 1 for examples that should use aggregation functions. + aggregation_labels (:obj:`tf.Tensor` of shape :obj:`(batch_size, )`): + Aggregation function id for every example in the batch. + use_answer_as_supervision (:obj:`bool`, `optional`): + Whether to use the answer as the only supervision for aggregation examples. + num_aggregation_labels (:obj:`int`, `optional`, defaults to 0): + The number of aggregation operators to predict. + aggregation_loss_weight (:obj:`float`, `optional`, defaults to 1.0): + Importance weight for the aggregation loss. + + Returns: + aggregation_loss (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): Aggregation loss per example. + """ + per_example_aggregation_loss = _calculate_aggregation_loss_known( + logits_aggregation, aggregate_mask, aggregation_labels, use_answer_as_supervision, num_aggregation_labels + ) + + if use_answer_as_supervision: + # Add aggregation loss for numeric answers that need aggregation. + per_example_aggregation_loss += _calculate_aggregation_loss_unknown(logits_aggregation, aggregate_mask) + return aggregation_loss_weight * per_example_aggregation_loss + + +def _calculate_expected_result( + dist_per_cell, numeric_values, numeric_values_scale, input_mask_float, logits_aggregation, config +): + """ + Calculates the expected result given cell and aggregation probabilities. + + Args: + dist_per_cell (:obj:`tfp.distributions.Bernoulli`): + Cell selection distribution for each cell. + numeric_values (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`): + Numeric values of every token. Nan for tokens which are not numeric values. + numeric_values_scale (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`): + Scale of the numeric values of every token. + input_mask_float (:obj: `tf.Tensor` of shape :obj:`(batch_size, seq_length)`): + Mask for the table, without question tokens and table headers. + logits_aggregation (:obj:`tf.Tensor` of shape :obj:`(batch_size, num_aggregation_labels)`): + Logits per aggregation operation. + config (:class:`~transformers.TapasConfig`): + Model configuration class with all the hyperparameters of the model + + Returns: + expected_result (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): The expected result per example. + """ + if config.use_gumbel_for_cells: + gumbel_dist = tfp.distributions.RelaxedBernoulli( + # The token logits where already divided by the temperature and used for + # computing cell selection errors so we need to multiply it again here + config.temperature, + logits=dist_per_cell.logits_parameter() * config.temperature, + ) + scaled_probability_per_cell = gumbel_dist.sample() + else: + scaled_probability_per_cell = dist_per_cell.probs_parameter() + + # [batch_size, seq_length] + scaled_probability_per_cell = (scaled_probability_per_cell / numeric_values_scale) * input_mask_float + count_result = tf.reduce_sum(scaled_probability_per_cell, axis=1) + numeric_values_masked = tf.where( + tf.math.is_nan(numeric_values), tf.zeros_like(numeric_values), numeric_values + ) # Mask non-numeric table values to zero. + sum_result = tf.reduce_sum(scaled_probability_per_cell * numeric_values_masked, axis=1) + avg_approximation = config.average_approximation_function + if avg_approximation == AverageApproximationFunction.RATIO: + average_result = sum_result / (count_result + EPSILON_ZERO_DIVISION) + elif avg_approximation == AverageApproximationFunction.FIRST_ORDER: + # The sum of all probabilities exept that correspond to other cells + ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1 + average_result = tf.reduce_sum(numeric_values_masked * scaled_probability_per_cell / ex, axis=1) + elif avg_approximation == AverageApproximationFunction.SECOND_ORDER: + # The sum of all probabilities exept that correspond to other cells + ex = tf.reduce_sum(scaled_probability_per_cell, axis=1, keepdims=True) - scaled_probability_per_cell + 1 + pointwise_var = scaled_probability_per_cell * (1 - scaled_probability_per_cell) + var = tf.reduce_sum(pointwise_var, axis=1, keepdims=True) - pointwise_var + multiplier = (var / tf.math.square(ex) + 1) / ex + average_result = tf.reduce_sum(numeric_values_masked * scaled_probability_per_cell * multiplier, axis=1) + else: + raise ValueError("Invalid average_approximation_function: %s", config.average_approximation_function) + + if config.use_gumbel_for_aggregation: + gumbel_dist = tfp.distributions.RelaxedOneHotCategorical( + config.aggregation_temperature, logits=logits_aggregation[:, 1:] + ) + # [batch_size, num_aggregation_labels - 1] + aggregation_op_only_probs = gumbel_dist.sample() + else: + # [batch_size, num_aggregation_labels - 1] + aggregation_op_only_probs = tf.nn.softmax(logits_aggregation[:, 1:] / config.aggregation_temperature, axis=-1) + all_results = tf.concat( + [ + tf.expand_dims(sum_result, axis=1), + tf.expand_dims(average_result, axis=1), + tf.expand_dims(count_result, axis=1), + ], + axis=1, + ) + expected_result = tf.reduce_sum(all_results * aggregation_op_only_probs, axis=1) + return expected_result + + +def _calculate_regression_loss( + answer, + aggregate_mask, + dist_per_cell, + numeric_values, + numeric_values_scale, + input_mask_float, + logits_aggregation, + config, +): + """ + Calculates the regression loss per example. + + Args: + answer (:obj: `tf.Tensor` of shape :obj:`(batch_size,)`): + Answer for every example in the batch. Nan if there is no scalar answer. + aggregate_mask (:obj: `tf.Tensor` of shape :obj:`(batch_size,)`): + A mask set to 1 for examples that should use aggregation functions. + dist_per_cell (:obj:`torch.distributions.Bernoulli`): + Cell selection distribution for each cell. + numeric_values (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`): + Numeric values of every token. Nan for tokens which are not numeric values. + numeric_values_scale (:obj:`tf.Tensor` of shape :obj:`(batch_size, seq_length)`): + Scale of the numeric values of every token. + input_mask_float (:obj: `tf.Tensor` of shape :obj:`(batch_size, seq_length)`): + Mask for the table, without question tokens and table headers. + logits_aggregation (:obj: `tf.Tensor` of shape :obj:`(batch_size, num_aggregation_labels)`): + Logits per aggregation operation. + config (:class:`~transformers.TapasConfig`): + Model configuration class with all the parameters of the model + + Returns: + per_example_answer_loss_scaled (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): Scales answer loss for each + example in the batch. large_answer_loss_mask (:obj:`tf.Tensor` of shape :obj:`(batch_size,)`): A mask which is + 1 for examples for which their answer loss is larger than the answer_loss_cutoff. + """ + # float32 (batch_size,) + expected_result = _calculate_expected_result( + dist_per_cell, numeric_values, numeric_values_scale, input_mask_float, logits_aggregation, config + ) + + # [batch_size] + answer_masked = tf.where(tf.math.is_nan(answer), tf.zeros_like(answer), answer) + + if config.use_normalized_answer_loss: + normalizer = tf.stop_gradient( + tf.math.maximum(tf.math.abs(expected_result), tf.math.abs(answer_masked)) + EPSILON_ZERO_DIVISION + ) + normalized_answer_masked = answer_masked / normalizer + normalized_expected_result = expected_result / normalizer + per_example_answer_loss = tf.compat.v1.losses.huber_loss( + normalized_answer_masked * aggregate_mask, + normalized_expected_result * aggregate_mask, + delta=tf.cast(1.0, tf.float32), + reduction=tf.losses.Reduction.NONE, + ) + else: + per_example_answer_loss = tf.compat.v1.losses.huber_loss( + answer_masked * aggregate_mask, + expected_result * aggregate_mask, + delta=tf.cast(config.huber_loss_delta, tf.float32), + reduction=tf.losses.Reduction.NONE, + ) + if config.answer_loss_cutoff is None: + large_answer_loss_mask = tf.ones_like(per_example_answer_loss, dtype=tf.float32) + else: + large_answer_loss_mask = tf.where( + per_example_answer_loss > config.answer_loss_cutoff, + tf.zeros_like(per_example_answer_loss, dtype=tf.float32), + tf.ones_like(per_example_answer_loss, dtype=tf.float32), + ) + per_example_answer_loss_scaled = config.answer_loss_importance * (per_example_answer_loss * aggregate_mask) + return per_example_answer_loss_scaled, large_answer_loss_mask diff --git a/src/transformers/models/tapas/tokenization_tapas.py b/src/transformers/models/tapas/tokenization_tapas.py index 3bd7a00d35..341d32ee19 100644 --- a/src/transformers/models/tapas/tokenization_tapas.py +++ b/src/transformers/models/tapas/tokenization_tapas.py @@ -1897,9 +1897,9 @@ class TapasTokenizer(PreTrainedTokenizer): data (:obj:`dict`): Dictionary mapping features to actual values. Should be created using :class:`~transformers.TapasTokenizer`. - logits (:obj:`np.ndarray` of shape ``(batch_size, sequence_length)``): + logits (:obj:`torch.Tensor` or :obj:`tf.Tensor` of shape ``(batch_size, sequence_length)``): Tensor containing the logits at the token level. - logits_agg (:obj:`np.ndarray` of shape ``(batch_size, num_aggregation_labels)``, `optional`): + logits_agg (:obj:`torch.Tensor` or :obj:`tf.Tensor` of shape ``(batch_size, num_aggregation_labels)``, `optional`): Tensor containing the aggregation logits. cell_classification_threshold (:obj:`float`, `optional`, defaults to 0.5): Threshold to be used for cell selection. All table cells for which their probability is larger than @@ -1915,6 +1915,11 @@ class TapasTokenizer(PreTrainedTokenizer): - predicted_aggregation_indices (``List[int]``of length ``batch_size``, `optional`, returned when ``logits_aggregation`` is provided): Predicted aggregation operator indices of the aggregation head. """ + # converting to numpy arrays to work with PT/TF + logits = logits.numpy() + if logits_agg is not None: + logits_agg = logits_agg.numpy() + data = {key: value.numpy() for key, value in data.items() if key != "training"} # input data is of type float32 # np.log(np.finfo(np.float32).max) = 88.72284 # Any value over 88.72284 will overflow when passed through the exponential, sending a warning @@ -1975,7 +1980,7 @@ class TapasTokenizer(PreTrainedTokenizer): output = (predicted_answer_coordinates,) if logits_agg is not None: - predicted_aggregation_indices = logits_agg.argmax(dim=-1) + predicted_aggregation_indices = logits_agg.argmax(axis=-1) output = (predicted_answer_coordinates, predicted_aggregation_indices.tolist()) return output diff --git a/src/transformers/pipelines/__init__.py b/src/transformers/pipelines/__init__.py index afbd41e615..90c718780d 100755 --- a/src/transformers/pipelines/__init__.py +++ b/src/transformers/pipelines/__init__.py @@ -78,6 +78,7 @@ if is_tf_available(): TFAutoModelForQuestionAnswering, TFAutoModelForSeq2SeqLM, TFAutoModelForSequenceClassification, + TFAutoModelForTableQuestionAnswering, TFAutoModelForTokenClassification, ) @@ -170,7 +171,7 @@ SUPPORTED_TASKS = { "table-question-answering": { "impl": TableQuestionAnsweringPipeline, "pt": (AutoModelForTableQuestionAnswering,) if is_torch_available() else (), - "tf": (), + "tf": (TFAutoModelForTableQuestionAnswering,) if is_tf_available() else (), "default": { "model": { "pt": "google/tapas-base-finetuned-wtq", diff --git a/src/transformers/pipelines/table_question_answering.py b/src/transformers/pipelines/table_question_answering.py index 7697752b2b..1ec93d1160 100644 --- a/src/transformers/pipelines/table_question_answering.py +++ b/src/transformers/pipelines/table_question_answering.py @@ -2,7 +2,13 @@ import collections import numpy as np -from ..file_utils import add_end_docstrings, is_torch_available, requires_backends +from ..file_utils import ( + add_end_docstrings, + is_tensorflow_probability_available, + is_tf_available, + is_torch_available, + requires_backends, +) from .base import PIPELINE_INIT_ARGS, ArgumentHandler, Pipeline, PipelineException @@ -11,6 +17,13 @@ if is_torch_available(): from ..models.auto.modeling_auto import MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING +if is_tf_available() and is_tensorflow_probability_available(): + import tensorflow as tf + + import tensorflow_probability as tfp + + from ..models.auto.modeling_tf_auto import TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING + class TableQuestionAnsweringArgumentHandler(ArgumentHandler): """ @@ -83,10 +96,11 @@ class TableQuestionAnsweringPipeline(Pipeline): super().__init__(*args, **kwargs) self._args_parser = args_parser - if self.framework == "tf": - raise ValueError("The TableQuestionAnsweringPipeline is only available in PyTorch.") - - self.check_model_type(MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING) + self.check_model_type( + TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING + if self.framework == "tf" + else MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING + ) self.aggregate = bool(getattr(self.model.config, "aggregation_labels")) and bool( getattr(self.model.config, "num_aggregation_labels") @@ -100,67 +114,129 @@ class TableQuestionAnsweringPipeline(Pipeline): Inference used for models that need to process sequences in a sequential fashion, like the SQA models which handle conversational query related to a table. """ - all_logits = [] - all_aggregations = [] - prev_answers = None - batch_size = inputs["input_ids"].shape[0] + if self.framework == "pt": + all_logits = [] + all_aggregations = [] + prev_answers = None + batch_size = inputs["input_ids"].shape[0] - input_ids = inputs["input_ids"].to(self.device) - attention_mask = inputs["attention_mask"].to(self.device) - token_type_ids = inputs["token_type_ids"].to(self.device) - token_type_ids_example = None + input_ids = inputs["input_ids"].to(self.device) + attention_mask = inputs["attention_mask"].to(self.device) + token_type_ids = inputs["token_type_ids"].to(self.device) + token_type_ids_example = None - for index in range(batch_size): - # If sequences have already been processed, the token type IDs will be created according to the previous - # answer. - if prev_answers is not None: - prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,) - model_labels = np.zeros_like(prev_labels_example.cpu().numpy()) # shape (seq_len,) + for index in range(batch_size): + # If sequences have already been processed, the token type IDs will be created according to the previous + # answer. + if prev_answers is not None: + prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,) + model_labels = np.zeros_like(prev_labels_example.cpu().numpy()) # shape (seq_len,) + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + for i in range(model_labels.shape[0]): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col_id = token_type_ids_example[:, 1].tolist()[i] - 1 + row_id = token_type_ids_example[:, 2].tolist()[i] - 1 + + if row_id >= 0 and col_id >= 0 and segment_id == 1: + model_labels[i] = int(prev_answers[(col_id, row_id)]) + + token_type_ids_example[:, 3] = torch.from_numpy(model_labels).type(torch.long).to(self.device) + + input_ids_example = input_ids[index] + attention_mask_example = attention_mask[index] # shape (seq_len,) token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) - for i in range(model_labels.shape[0]): + outputs = self.model( + input_ids=input_ids_example.unsqueeze(0), + attention_mask=attention_mask_example.unsqueeze(0), + token_type_ids=token_type_ids_example.unsqueeze(0), + ) + logits = outputs.logits + + if self.aggregate: + all_aggregations.append(outputs.logits_aggregation) + + all_logits.append(logits) + + dist_per_token = torch.distributions.Bernoulli(logits=logits) + probabilities = dist_per_token.probs * attention_mask_example.type(torch.float32).to( + dist_per_token.probs.device + ) + + coords_to_probs = collections.defaultdict(list) + for i, p in enumerate(probabilities.squeeze().tolist()): segment_id = token_type_ids_example[:, 0].tolist()[i] - col_id = token_type_ids_example[:, 1].tolist()[i] - 1 - row_id = token_type_ids_example[:, 2].tolist()[i] - 1 + col = token_type_ids_example[:, 1].tolist()[i] - 1 + row = token_type_ids_example[:, 2].tolist()[i] - 1 + if col >= 0 and row >= 0 and segment_id == 1: + coords_to_probs[(col, row)].append(p) - if row_id >= 0 and col_id >= 0 and segment_id == 1: - model_labels[i] = int(prev_answers[(col_id, row_id)]) + prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs} - token_type_ids_example[:, 3] = torch.from_numpy(model_labels).type(torch.long).to(self.device) + logits_batch = torch.cat(tuple(all_logits), 0) - input_ids_example = input_ids[index] - attention_mask_example = attention_mask[index] # shape (seq_len,) - token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) - outputs = self.model( - input_ids=input_ids_example.unsqueeze(0), - attention_mask=attention_mask_example.unsqueeze(0), - token_type_ids=token_type_ids_example.unsqueeze(0), - ) - logits = outputs.logits + return (logits_batch,) if not self.aggregate else (logits_batch, torch.cat(tuple(all_aggregations), 0)) + else: + all_logits = [] + all_aggregations = [] + prev_answers = None + batch_size = inputs["input_ids"].shape[0] - if self.aggregate: - all_aggregations.append(outputs.logits_aggregation) + input_ids = inputs["input_ids"] + attention_mask = inputs["attention_mask"] + token_type_ids = inputs["token_type_ids"].numpy() + token_type_ids_example = None - all_logits.append(logits) + for index in range(batch_size): + # If sequences have already been processed, the token type IDs will be created according to the previous + # answer. + if prev_answers is not None: + prev_labels_example = token_type_ids_example[:, 3] # shape (seq_len,) + model_labels = np.zeros_like(prev_labels_example, dtype=np.int32) # shape (seq_len,) - dist_per_token = torch.distributions.Bernoulli(logits=logits) - probabilities = dist_per_token.probs * attention_mask_example.type(torch.float32).to( - dist_per_token.probs.device - ) + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + for i in range(model_labels.shape[0]): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col_id = token_type_ids_example[:, 1].tolist()[i] - 1 + row_id = token_type_ids_example[:, 2].tolist()[i] - 1 - coords_to_probs = collections.defaultdict(list) - for i, p in enumerate(probabilities.squeeze().tolist()): - segment_id = token_type_ids_example[:, 0].tolist()[i] - col = token_type_ids_example[:, 1].tolist()[i] - 1 - row = token_type_ids_example[:, 2].tolist()[i] - 1 - if col >= 0 and row >= 0 and segment_id == 1: - coords_to_probs[(col, row)].append(p) + if row_id >= 0 and col_id >= 0 and segment_id == 1: + model_labels[i] = int(prev_answers[(col_id, row_id)]) - prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs} + token_type_ids_example[:, 3] = model_labels - logits_batch = torch.cat(tuple(all_logits), 0) + input_ids_example = input_ids[index] + attention_mask_example = attention_mask[index] # shape (seq_len,) + token_type_ids_example = token_type_ids[index] # shape (seq_len, 7) + outputs = self.model( + input_ids=np.expand_dims(input_ids_example, axis=0), + attention_mask=np.expand_dims(attention_mask_example, axis=0), + token_type_ids=np.expand_dims(token_type_ids_example, axis=0), + ) + logits = outputs.logits - return (logits_batch,) if not self.aggregate else (logits_batch, torch.cat(tuple(all_aggregations), 0)) + if self.aggregate: + all_aggregations.append(outputs.logits_aggregation) + + all_logits.append(logits) + + dist_per_token = tfp.distributions.Bernoulli(logits=logits) + probabilities = dist_per_token.probs_parameter() * tf.cast(attention_mask_example, tf.float32) + + coords_to_probs = collections.defaultdict(list) + token_type_ids_example = token_type_ids_example + for i, p in enumerate(tf.squeeze(probabilities).numpy().tolist()): + segment_id = token_type_ids_example[:, 0].tolist()[i] + col = token_type_ids_example[:, 1].tolist()[i] - 1 + row = token_type_ids_example[:, 2].tolist()[i] - 1 + if col >= 0 and row >= 0 and segment_id == 1: + coords_to_probs[(col, row)].append(p) + + prev_answers = {key: np.array(coords_to_probs[key]).mean() > 0.5 for key in coords_to_probs} + + logits_batch = tf.concat(tuple(all_logits), 0) + + return (logits_batch,) if not self.aggregate else (logits_batch, tf.concat(tuple(all_aggregations), 0)) def __call__(self, *args, **kwargs): r""" @@ -274,7 +350,7 @@ class TableQuestionAnsweringPipeline(Pipeline): outputs = model_outputs["outputs"] if self.aggregate: logits, logits_agg = outputs[:2] - predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits.detach(), logits_agg) + predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits, logits_agg) answer_coordinates_batch, agg_predictions = predictions aggregators = {i: self.model.config.aggregation_labels[pred] for i, pred in enumerate(agg_predictions)} @@ -284,7 +360,7 @@ class TableQuestionAnsweringPipeline(Pipeline): } else: logits = outputs[0] - predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits.detach()) + predictions = self.tokenizer.convert_logits_to_predictions(inputs, logits) answer_coordinates_batch = predictions[0] aggregators = {} aggregators_prefix = {} diff --git a/src/transformers/testing_utils.py b/src/transformers/testing_utils.py index bbfdaedfb7..eebbe737e4 100644 --- a/src/transformers/testing_utils.py +++ b/src/transformers/testing_utils.py @@ -44,6 +44,7 @@ from .file_utils import ( is_scatter_available, is_sentencepiece_available, is_soundfile_availble, + is_tensorflow_probability_available, is_tf_available, is_timm_available, is_tokenizers_available, @@ -292,6 +293,19 @@ def require_torch_scatter(test_case): return test_case +def require_tensorflow_probability(test_case): + """ + Decorator marking a test that requires TensorFlow probability. + + These tests are skipped when TensorFlow probability isn't installed. + + """ + if not is_tensorflow_probability_available(): + return unittest.skip("test requires TensorFlow probability")(test_case) + else: + return test_case + + def require_torchaudio(test_case): """ Decorator marking a test that requires torchaudio. These tests are skipped when torchaudio isn't installed. diff --git a/src/transformers/utils/dummy_tf_objects.py b/src/transformers/utils/dummy_tf_objects.py index 14991e8b6a..ca564457a2 100644 --- a/src/transformers/utils/dummy_tf_objects.py +++ b/src/transformers/utils/dummy_tf_objects.py @@ -239,6 +239,9 @@ TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING = None TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING = None +TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING = None + + TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING = None @@ -356,6 +359,18 @@ class TFAutoModelForSequenceClassification: requires_backends(self, ["tf"]) +class TFAutoModelForTableQuestionAnswering: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tf"]) + + def call(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + class TFAutoModelForTokenClassification: def __init__(self, *args, **kwargs): requires_backends(self, ["tf"]) @@ -2488,6 +2503,69 @@ class TFT5PreTrainedModel: requires_backends(self, ["tf"]) +TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST = None + + +class TFTapasForMaskedLM: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tf"]) + + def call(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFTapasForQuestionAnswering: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tf"]) + + def call(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFTapasForSequenceClassification: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tf"]) + + def call(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFTapasModel: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tf"]) + + def call(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + +class TFTapasPreTrainedModel: + def __init__(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + @classmethod + def from_pretrained(cls, *args, **kwargs): + requires_backends(cls, ["tf"]) + + def call(self, *args, **kwargs): + requires_backends(self, ["tf"]) + + TF_TRANSFO_XL_PRETRAINED_MODEL_ARCHIVE_LIST = None diff --git a/tests/test_modeling_tf_auto.py b/tests/test_modeling_tf_auto.py index bad3353cc5..bc3a7a6a87 100644 --- a/tests/test_modeling_tf_auto.py +++ b/tests/test_modeling_tf_auto.py @@ -17,8 +17,14 @@ import copy import tempfile import unittest -from transformers import CONFIG_MAPPING, AutoConfig, BertConfig, GPT2Config, T5Config, is_tf_available -from transformers.testing_utils import DUMMY_UNKNOWN_IDENTIFIER, SMALL_MODEL_IDENTIFIER, require_tf, slow +from transformers import CONFIG_MAPPING, AutoConfig, BertConfig, GPT2Config, T5Config, TapasConfig, is_tf_available +from transformers.testing_utils import ( + DUMMY_UNKNOWN_IDENTIFIER, + SMALL_MODEL_IDENTIFIER, + require_tensorflow_probability, + require_tf, + slow, +) from .test_modeling_bert import BertModelTester @@ -32,6 +38,7 @@ if is_tf_available(): TFAutoModelForQuestionAnswering, TFAutoModelForSeq2SeqLM, TFAutoModelForSequenceClassification, + TFAutoModelForTableQuestionAnswering, TFAutoModelForTokenClassification, TFAutoModelWithLMHead, TFBertForMaskedLM, @@ -44,6 +51,7 @@ if is_tf_available(): TFGPT2LMHeadModel, TFRobertaForMaskedLM, TFT5ForConditionalGeneration, + TFTapasForQuestionAnswering, ) from transformers.models.auto.modeling_tf_auto import ( TF_MODEL_FOR_CAUSAL_LM_MAPPING, @@ -52,6 +60,7 @@ if is_tf_available(): TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, TF_MODEL_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING, @@ -59,6 +68,7 @@ if is_tf_available(): from transformers.models.bert.modeling_tf_bert import TF_BERT_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.models.gpt2.modeling_tf_gpt2 import TF_GPT2_PRETRAINED_MODEL_ARCHIVE_LIST from transformers.models.t5.modeling_tf_t5 import TF_T5_PRETRAINED_MODEL_ARCHIVE_LIST + from transformers.models.tapas.modeling_tf_tapas import TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST class NewModelConfig(BertConfig): @@ -176,6 +186,21 @@ class TFAutoModelTest(unittest.TestCase): self.assertIsNotNone(model) self.assertIsInstance(model, TFBertForQuestionAnswering) + @slow + @require_tensorflow_probability + def test_table_question_answering_model_from_pretrained(self): + for model_name in TF_TAPAS_PRETRAINED_MODEL_ARCHIVE_LIST[5:6]: + config = AutoConfig.from_pretrained(model_name) + self.assertIsNotNone(config) + self.assertIsInstance(config, TapasConfig) + + model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_name) + model, loading_info = TFAutoModelForTableQuestionAnswering.from_pretrained( + model_name, output_loading_info=True + ) + self.assertIsNotNone(model) + self.assertIsInstance(model, TFTapasForQuestionAnswering) + def test_from_pretrained_identifier(self): model = TFAutoModelWithLMHead.from_pretrained(SMALL_MODEL_IDENTIFIER) self.assertIsInstance(model, TFBertForMaskedLM) @@ -210,6 +235,7 @@ class TFAutoModelTest(unittest.TestCase): TF_MODEL_MAPPING, TF_MODEL_FOR_PRETRAINING_MAPPING, TF_MODEL_FOR_QUESTION_ANSWERING_MAPPING, + TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, TF_MODEL_WITH_LM_HEAD_MAPPING, diff --git a/tests/test_modeling_tf_tapas.py b/tests/test_modeling_tf_tapas.py new file mode 100644 index 0000000000..ca44781973 --- /dev/null +++ b/tests/test_modeling_tf_tapas.py @@ -0,0 +1,1036 @@ +# coding=utf-8 +# Copyright 2021 The HuggingFace Inc. team. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import copy +import unittest + +import numpy as np +import pandas as pd + +from transformers import ( + TF_MODEL_FOR_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_MASKED_LM_MAPPING, + TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING, + TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING, + TF_MODEL_FOR_PRETRAINING_MAPPING, + TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING, + TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING, + TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING, + TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING, + TapasConfig, + TapasTokenizer, + is_tf_available, +) +from transformers.file_utils import cached_property +from transformers.models.auto import get_values +from transformers.testing_utils import require_tensorflow_probability, require_tf, slow + +from .test_configuration_common import ConfigTester +from .test_modeling_tf_common import TFModelTesterMixin, ids_tensor + + +if is_tf_available(): + import tensorflow as tf + + from transformers import ( + TFTapasForMaskedLM, + TFTapasForQuestionAnswering, + TFTapasForSequenceClassification, + TFTapasModel, + ) + from transformers.models.tapas.modeling_tf_tapas import ( + IndexMap, + ProductIndexMap, + flatten, + gather, + range_index_map, + reduce_max, + reduce_mean, + reduce_sum, + ) + + +class TFTapasModelTester: + def __init__( + self, + parent, + batch_size=13, + seq_length=7, + is_training=True, + use_input_mask=True, + use_token_type_ids=True, + use_labels=True, + vocab_size=99, + hidden_size=32, + num_hidden_layers=5, + num_attention_heads=4, + intermediate_size=37, + hidden_act="gelu", + hidden_dropout_prob=0.1, + attention_probs_dropout_prob=0.1, + initializer_range=0.02, + max_position_embeddings=512, + type_vocab_sizes=[3, 256, 256, 2, 256, 256, 10], + type_sequence_label_size=2, + positive_weight=10.0, + num_aggregation_labels=4, + num_labels=2, + aggregation_loss_importance=0.8, + use_answer_as_supervision=True, + answer_loss_importance=0.001, + use_normalized_answer_loss=False, + huber_loss_delta=25.0, + temperature=1.0, + agg_temperature=1.0, + use_gumbel_for_cells=False, + use_gumbel_for_agg=False, + average_approximation_function="ratio", + cell_selection_preference=0.5, + answer_loss_cutoff=100, + max_num_rows=64, + max_num_columns=32, + average_logits_per_cell=True, + select_one_column=True, + allow_empty_column_selection=False, + init_cell_selection_weights_to_zero=True, + reset_position_index_per_cell=True, + disable_per_token_loss=False, + scope=None, + ): + self.parent = parent + self.batch_size = batch_size + self.seq_length = seq_length + self.is_training = is_training + self.use_input_mask = use_input_mask + self.use_token_type_ids = use_token_type_ids + self.use_labels = use_labels + self.vocab_size = vocab_size + self.hidden_size = hidden_size + self.num_hidden_layers = num_hidden_layers + self.num_attention_heads = num_attention_heads + self.intermediate_size = intermediate_size + self.hidden_act = hidden_act + self.hidden_dropout_prob = hidden_dropout_prob + self.attention_probs_dropout_prob = attention_probs_dropout_prob + self.initializer_range = initializer_range + self.max_position_embeddings = max_position_embeddings + self.type_vocab_sizes = type_vocab_sizes + self.type_sequence_label_size = type_sequence_label_size + self.positive_weight = positive_weight + self.num_aggregation_labels = num_aggregation_labels + self.num_labels = num_labels + self.aggregation_loss_importance = aggregation_loss_importance + self.use_answer_as_supervision = use_answer_as_supervision + self.answer_loss_importance = answer_loss_importance + self.use_normalized_answer_loss = use_normalized_answer_loss + self.huber_loss_delta = huber_loss_delta + self.temperature = temperature + self.agg_temperature = agg_temperature + self.use_gumbel_for_cells = use_gumbel_for_cells + self.use_gumbel_for_agg = use_gumbel_for_agg + self.average_approximation_function = average_approximation_function + self.cell_selection_preference = cell_selection_preference + self.answer_loss_cutoff = answer_loss_cutoff + self.max_num_rows = max_num_rows + self.max_num_columns = max_num_columns + self.average_logits_per_cell = average_logits_per_cell + self.select_one_column = select_one_column + self.allow_empty_column_selection = allow_empty_column_selection + self.init_cell_selection_weights_to_zero = init_cell_selection_weights_to_zero + self.reset_position_index_per_cell = reset_position_index_per_cell + self.disable_per_token_loss = disable_per_token_loss + self.scope = scope + + def prepare_config_and_inputs(self): + input_ids = ids_tensor([self.batch_size, self.seq_length], self.vocab_size) + + input_mask = None + if self.use_input_mask: + input_mask = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + + token_type_ids = [] + for type_vocab_size in self.type_vocab_sizes: + token_type_ids.append(ids_tensor(shape=[self.batch_size, self.seq_length], vocab_size=type_vocab_size)) + token_type_ids = tf.stack(token_type_ids, axis=2) + + sequence_labels = None + token_labels = None + labels = None + numeric_values = None + numeric_values_scale = None + float_answer = None + aggregation_labels = None + if self.use_labels: + sequence_labels = ids_tensor([self.batch_size], self.type_sequence_label_size) + token_labels = ids_tensor([self.batch_size, self.seq_length], self.num_labels) + labels = ids_tensor([self.batch_size, self.seq_length], vocab_size=2) + numeric_values = ids_tensor([self.batch_size, self.seq_length], vocab_size=2, dtype=tf.float32) + numeric_values_scale = ids_tensor([self.batch_size, self.seq_length], vocab_size=2, dtype=tf.float32) + float_answer = ids_tensor([self.batch_size], vocab_size=2, dtype=tf.float32) + aggregation_labels = ids_tensor([self.batch_size], self.num_aggregation_labels) + + config = self.get_config() + + return ( + config, + input_ids, + input_mask, + token_type_ids, + sequence_labels, + token_labels, + labels, + numeric_values, + numeric_values_scale, + float_answer, + aggregation_labels, + ) + + def get_config(self): + return TapasConfig( + vocab_size=self.vocab_size, + hidden_size=self.hidden_size, + num_hidden_layers=self.num_hidden_layers, + num_attention_heads=self.num_attention_heads, + intermediate_size=self.intermediate_size, + hidden_act=self.hidden_act, + hidden_dropout_prob=self.hidden_dropout_prob, + attention_probs_dropout_prob=self.attention_probs_dropout_prob, + max_position_embeddings=self.max_position_embeddings, + type_vocab_sizes=self.type_vocab_sizes, + initializer_range=self.initializer_range, + positive_weight=self.positive_weight, + num_aggregation_labels=self.num_aggregation_labels, + num_labels=self.num_labels, + aggregation_loss_importance=self.aggregation_loss_importance, + use_answer_as_supervision=self.use_answer_as_supervision, + answer_loss_importance=self.answer_loss_importance, + use_normalized_answer_loss=self.use_normalized_answer_loss, + huber_loss_delta=self.huber_loss_delta, + temperature=self.temperature, + agg_temperature=self.agg_temperature, + use_gumbel_for_cells=self.use_gumbel_for_cells, + use_gumbel_for_agg=self.use_gumbel_for_agg, + average_approximation_function=self.average_approximation_function, + cell_selection_preference=self.cell_selection_preference, + answer_loss_cutoff=self.answer_loss_cutoff, + max_num_rows=self.max_num_rows, + max_num_columns=self.max_num_columns, + average_logits_per_cell=self.average_logits_per_cell, + select_one_column=self.select_one_column, + allow_empty_column_selection=self.allow_empty_column_selection, + init_cell_selection_weights_to_zero=self.init_cell_selection_weights_to_zero, + reset_position_index_per_cell=self.reset_position_index_per_cell, + disable_per_token_loss=self.disable_per_token_loss, + ) + + def create_and_check_model( + self, + config, + input_ids, + input_mask, + token_type_ids, + sequence_labels, + token_labels, + labels, + numeric_values, + numeric_values_scale, + float_answer, + aggregation_labels, + ): + model = TFTapasModel(config=config) + + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + inputs.pop("attention_mask") + result = model(inputs) + inputs.pop("token_type_ids") + result = model(inputs) + + self.parent.assertEqual(result.last_hidden_state.shape, (self.batch_size, self.seq_length, self.hidden_size)) + self.parent.assertEqual(result.pooler_output.shape, (self.batch_size, self.hidden_size)) + + def create_and_check_for_masked_lm( + self, + config, + input_ids, + input_mask, + token_type_ids, + sequence_labels, + token_labels, + labels, + numeric_values, + numeric_values_scale, + float_answer, + aggregation_labels, + ): + model = TFTapasForMaskedLM(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + "labels": token_labels, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length, self.vocab_size)) + + def create_and_check_for_sequence_classification( + self, + config, + input_ids, + input_mask, + token_type_ids, + sequence_labels, + token_labels, + labels, + numeric_values, + numeric_values_scale, + float_answer, + aggregation_labels, + ): + config.num_labels = self.num_labels + model = TFTapasForSequenceClassification(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "labels": sequence_labels, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.num_labels)) + + def create_and_check_for_question_answering( + self, + config, + input_ids, + input_mask, + token_type_ids, + sequence_labels, + token_labels, + labels, + numeric_values, + numeric_values_scale, + float_answer, + aggregation_labels, + ): + # inference: without aggregation head (SQA). Model only returns logits + sqa_config = copy.copy(config) + sqa_config.num_aggregation_labels = 0 + sqa_config.use_answer_as_supervision = False + model = TFTapasForQuestionAnswering(config=sqa_config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + + # inference: with aggregation head (WTQ, WikiSQL-supervised). Model returns logits and aggregation logits + model = TFTapasForQuestionAnswering(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + } + result = model(inputs) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels)) + + # training: can happen in 3 main ways + # case 1: conversational (SQA) + model = TFTapasForQuestionAnswering(config=sqa_config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + "labels": labels, + } + result = model(inputs) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + + # case 2: weak supervision for aggregation (WTQ) + model = TFTapasForQuestionAnswering(config=config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + "labels": labels, + "numeric_values": numeric_values, + "numeric_values_scale": numeric_values_scale, + "float_answer": float_answer, + } + result = model(inputs) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels)) + + # case 3: strong supervision for aggregation (WikiSQL-supervised) + wikisql_config = copy.copy(config) + wikisql_config.use_answer_as_supervision = False + model = TFTapasForQuestionAnswering(config=wikisql_config) + inputs = { + "input_ids": input_ids, + "attention_mask": input_mask, + "token_type_ids": token_type_ids, + "labels": labels, + "aggregation_labels": aggregation_labels, + } + result = model(inputs) + self.parent.assertEqual(result.loss.shape, ()) + self.parent.assertEqual(result.logits.shape, (self.batch_size, self.seq_length)) + self.parent.assertEqual(result.logits_aggregation.shape, (self.batch_size, self.num_aggregation_labels)) + + def prepare_config_and_inputs_for_common(self): + config_and_inputs = self.prepare_config_and_inputs() + ( + config, + input_ids, + input_mask, + token_type_ids, + sequence_labels, + token_labels, + labels, + numeric_values, + numeric_values_scale, + float_answer, + aggregation_labels, + ) = config_and_inputs + inputs_dict = {"input_ids": input_ids, "token_type_ids": token_type_ids, "attention_mask": input_mask} + return config, inputs_dict + + +@require_tensorflow_probability +@require_tf +class TFTapasModelTest(TFModelTesterMixin, unittest.TestCase): + + all_model_classes = ( + ( + TFTapasModel, + TFTapasForMaskedLM, + TFTapasForSequenceClassification, + TFTapasForQuestionAnswering, + ) + if is_tf_available() + else () + ) + test_head_masking = False + test_onnx = False + + def _prepare_for_class(self, inputs_dict, model_class, return_labels=False) -> dict: + inputs_dict = copy.deepcopy(inputs_dict) + + if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs_dict = { + k: tf.tile(tf.expand_dims(v, 1), (1, self.model_tester.num_choices) + (1,) * (v.ndim - 1)) + if isinstance(v, tf.Tensor) and v.ndim > 0 + else v + for k, v in inputs_dict.items() + } + + if return_labels: + if model_class in get_values(TF_MODEL_FOR_MULTIPLE_CHOICE_MAPPING): + inputs_dict["labels"] = tf.ones(self.model_tester.batch_size, dtype=tf.int32) + elif model_class in get_values(TF_MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING): + inputs_dict["labels"] = tf.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32 + ) + inputs_dict["aggregation_labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + inputs_dict["numeric_values"] = tf.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.float32 + ) + inputs_dict["numeric_values_scale"] = tf.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.float32 + ) + inputs_dict["float_answer"] = tf.zeros(self.model_tester.batch_size, dtype=tf.float32) + elif model_class in get_values(TF_MODEL_FOR_SEQUENCE_CLASSIFICATION_MAPPING): + inputs_dict["labels"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + elif model_class in get_values(TF_MODEL_FOR_NEXT_SENTENCE_PREDICTION_MAPPING): + inputs_dict["next_sentence_label"] = tf.zeros(self.model_tester.batch_size, dtype=tf.int32) + elif model_class in [ + *get_values(TF_MODEL_FOR_TOKEN_CLASSIFICATION_MAPPING), + *get_values(TF_MODEL_FOR_CAUSAL_LM_MAPPING), + *get_values(TF_MODEL_FOR_MASKED_LM_MAPPING), + *get_values(TF_MODEL_FOR_PRETRAINING_MAPPING), + *get_values(TF_MODEL_FOR_SEQ_TO_SEQ_CAUSAL_LM_MAPPING), + ]: + inputs_dict["labels"] = tf.zeros( + (self.model_tester.batch_size, self.model_tester.seq_length), dtype=tf.int32 + ) + return inputs_dict + + def setUp(self): + self.model_tester = TFTapasModelTester(self) + self.config_tester = ConfigTester(self, config_class=TapasConfig, hidden_size=37) + + def test_config(self): + self.config_tester.run_common_tests() + + def test_model(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_model(*config_and_inputs) + + def test_for_masked_lm(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_masked_lm(*config_and_inputs) + + def test_for_question_answering(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_question_answering(*config_and_inputs) + + def test_for_sequence_classification(self): + config_and_inputs = self.model_tester.prepare_config_and_inputs() + self.model_tester.create_and_check_for_sequence_classification(*config_and_inputs) + + +def prepare_tapas_single_inputs_for_inference(): + # Here we prepare a single table-question pair to test TAPAS inference on: + data = { + "Footballer": ["Lionel Messi", "Cristiano Ronaldo"], + "Age": ["33", "35"], + } + queries = "Which footballer is 33 years old?" + table = pd.DataFrame.from_dict(data) + + return table, queries + + +def prepare_tapas_batch_inputs_for_inference(): + # Here we prepare a batch of 2 table-question pairs to test TAPAS inference on: + data = { + "Footballer": ["Lionel Messi", "Cristiano Ronaldo"], + "Age": ["33", "35"], + "Number of goals": ["712", "750"], + } + queries = ["Which footballer is 33 years old?", "How many goals does Ronaldo have?"] + table = pd.DataFrame.from_dict(data) + + return table, queries + + +def prepare_tapas_batch_inputs_for_training(): + # Here we prepare a DIFFERENT batch of 2 table-question pairs to test TAPAS training on: + data = { + "Footballer": ["Lionel Messi", "Cristiano Ronaldo"], + "Age": ["33", "35"], + "Number of goals": ["712", "750"], + } + queries = ["Which footballer is 33 years old?", "What's the total number of goals?"] + table = pd.DataFrame.from_dict(data) + + answer_coordinates = [[(0, 0)], [(0, 2), (1, 2)]] + answer_text = [["Lionel Messi"], ["1462"]] + float_answer = [float("NaN"), float("1462")] + + return table, queries, answer_coordinates, answer_text, float_answer + + +@require_tensorflow_probability +@require_tf +class TFTapasModelIntegrationTest(unittest.TestCase): + @cached_property + def default_tokenizer(self): + return TapasTokenizer.from_pretrained("google/tapas-base-finetuned-wtq") + + @slow + def test_inference_no_head(self): + # ideally we want to test this with the weights of tapas_inter_masklm_base_reset, + # but since it's not straightforward to do this with the TF 1 implementation, we test it with + # the weights of the WTQ base model (i.e. tapas_wtq_wikisql_sqa_inter_masklm_base_reset) + model = TFTapasModel.from_pretrained("google/tapas-base-finetuned-wtq") + tokenizer = self.default_tokenizer + table, queries = prepare_tapas_single_inputs_for_inference() + inputs = tokenizer(table=table, queries=queries, return_tensors="tf") + outputs = model(**inputs) + + # test the sequence output + expected_slice = tf.constant( + [ + [ + [-0.141581565, -0.599805772, 0.747186482], + [-0.143664181, -0.602008104, 0.749218345], + [-0.15169853, -0.603363097, 0.741370678], + ] + ] + ) + tf.debugging.assert_near(outputs.last_hidden_state[:, :3, :3], expected_slice, atol=0.0005) + + # test the pooled output + expected_slice = tf.constant([[0.987518311, -0.970520139, -0.994303405]]) + + tf.debugging.assert_near(outputs.pooler_output[:, :3], expected_slice, atol=0.0005) + + @unittest.skip(reason="Model not available yet") + def test_inference_masked_lm(self): + pass + + # TapasForQuestionAnswering has 3 possible ways of being fine-tuned: + # - conversational set-up (SQA) + # - weak supervision for aggregation (WTQ, WikiSQL) + # - strong supervision for aggregation (WikiSQL-supervised) + # We test all of them: + @slow + def test_inference_question_answering_head_conversational(self): + # note that google/tapas-base-finetuned-sqa should correspond to tapas_sqa_inter_masklm_base_reset + model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-sqa") + tokenizer = self.default_tokenizer + table, queries = prepare_tapas_single_inputs_for_inference() + inputs = tokenizer(table=table, queries=queries, return_tensors="tf") + outputs = model(**inputs) + + # test the logits + logits = outputs.logits + expected_shape = tf.TensorShape([1, 21]) + tf.debugging.assert_equal(logits.shape, expected_shape) + + expected_slice = tf.constant( + [ + [ + -9997.274, + -9997.274, + -9997.274, + -9997.274, + -9997.274, + -9997.274, + -9997.274, + -9997.274, + -9997.274, + -16.262585, + -10004.089, + 15.435196, + 15.435196, + 15.435196, + -9990.443, + -16.327433, + -16.327433, + -16.327433, + -16.327433, + -16.327433, + -10004.84, + ] + ] + ) + + tf.debugging.assert_near(logits, expected_slice, atol=0.015) + + @slow + def test_inference_question_answering_head_conversational_absolute_embeddings(self): + # note that google/tapas-small-finetuned-sqa should correspond to tapas_sqa_inter_masklm_small_reset + # however here we test the version with absolute position embeddings + model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-small-finetuned-sqa") + tokenizer = self.default_tokenizer + table, queries = prepare_tapas_single_inputs_for_inference() + inputs = tokenizer(table=table, queries=queries, return_tensors="tf") + outputs = model(**inputs) + + # test the logits + logits = outputs.logits + expected_shape = tf.TensorShape([1, 21]) + tf.debugging.assert_equal(logits.shape, expected_shape) + + expected_slice = tf.constant( + [ + [ + -10000.041, + -10000.041, + -10000.041, + -10000.041, + -10000.041, + -10000.041, + -10000.041, + -10000.041, + -10000.041, + -18.369339, + -10014.692, + 17.730324, + 17.730324, + 17.730324, + -9984.974, + -18.322773, + -18.322773, + -18.322773, + -18.322773, + -18.322773, + -10007.267, + ] + ] + ) + + tf.debugging.assert_near(logits, expected_slice, atol=0.01) + + @slow + def test_inference_question_answering_head_weak_supervision(self): + # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset + model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq") + + tokenizer = self.default_tokenizer + # let's test on a batch + table, queries = prepare_tapas_batch_inputs_for_inference() + inputs = tokenizer(table=table, queries=queries, padding="longest", return_tensors="tf") + outputs = model(**inputs) + + # test the logits + logits = outputs.logits + expected_shape = tf.TensorShape([2, 28]) + tf.debugging.assert_equal(logits.shape, expected_shape) + + expected_slice = tf.constant( + [ + [-160.375504, -160.375504, -160.375504, -10072.3965, -10070.9414, -10094.9736], + [-9861.6123, -9861.6123, -9861.6123, -9861.6123, -9891.01172, 146.600677], + ] + ) + + tf.debugging.assert_near(logits[:, -6:], expected_slice, atol=0.4) + + # test the aggregation logits + logits_aggregation = outputs.logits_aggregation + expected_shape = tf.TensorShape([2, 4]) + tf.debugging.assert_equal(logits_aggregation.shape, expected_shape) + expected_tensor = tf.constant( + [[18.8545208, -9.76614857, -6.3128891, -2.93525243], [-4.05782509, 40.0351, -5.35329962, 23.3978653]] + ) + tf.debugging.assert_near(logits_aggregation, expected_tensor, atol=0.001) + + # test the predicted answer coordinates and aggregation indices + EXPECTED_PREDICTED_ANSWER_COORDINATES = [[(0, 0)], [(1, 2)]] + EXPECTED_PREDICTED_AGGREGATION_INDICES = [0, 1] + + predicted_answer_coordinates, predicted_aggregation_indices = tokenizer.convert_logits_to_predictions( + inputs, outputs.logits, outputs.logits_aggregation + ) + tf.debugging.assert_equal(EXPECTED_PREDICTED_ANSWER_COORDINATES, predicted_answer_coordinates) + tf.debugging.assert_equal(EXPECTED_PREDICTED_AGGREGATION_INDICES, predicted_aggregation_indices) + + @slow + def test_training_question_answering_head_weak_supervision(self): + # note that google/tapas-base-finetuned-wtq should correspond to tapas_wtq_wikisql_sqa_inter_masklm_base_reset + model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wtq") + tokenizer = self.default_tokenizer + # let's test on a batch + table, queries, answer_coordinates, answer_text, float_answer = prepare_tapas_batch_inputs_for_training() + inputs = tokenizer( + table=table, + queries=queries, + answer_coordinates=answer_coordinates, + answer_text=answer_text, + padding="longest", + return_tensors="tf", + ) + # the answer should be prepared by the user + float_answer = tf.constant(float_answer, dtype=tf.float32) + outputs = model( + input_ids=inputs["input_ids"], + attention_mask=inputs["attention_mask"], + token_type_ids=inputs["token_type_ids"], + labels=inputs["labels"], + numeric_values=inputs["numeric_values"], + numeric_values_scale=inputs["numeric_values_scale"], + float_answer=float_answer, + ) + + # test the loss + loss = outputs.loss + expected_loss = tf.constant(3.3527612686157227e-08) + tf.debugging.assert_near(loss, expected_loss, atol=1e-6) + + # test the logits on the first example + logits = outputs.logits + expected_shape = tf.TensorShape([2, 29]) + tf.debugging.assert_equal(logits.shape, expected_shape) + expected_slice = tf.constant( + [ + -160.0156, + -160.0156, + -160.0156, + -160.0156, + -160.0156, + -10072.2266, + -10070.8896, + -10092.6006, + -10092.6006, + ] + ) + tf.debugging.assert_near(logits[0, -9:], expected_slice, atol=1e-6) + + # test the aggregation logits on the second example + logits_aggregation = outputs.logits_aggregation + expected_shape = tf.TensorShape([2, 4]) + tf.debugging.assert_equal(logits_aggregation.shape, expected_shape) + expected_tensor = tf.constant([-4.0538, 40.0304, -5.3554, 23.3965]) + tf.debugging.assert_near(logits_aggregation[1, -4:], expected_tensor, atol=1e-4) + + @slow + def test_inference_question_answering_head_strong_supervision(self): + # note that google/tapas-base-finetuned-wikisql-supervised should correspond to tapas_wikisql_sqa_inter_masklm_base_reset + model = TFTapasForQuestionAnswering.from_pretrained("google/tapas-base-finetuned-wikisql-supervised") + tokenizer = self.default_tokenizer + + table, queries = prepare_tapas_single_inputs_for_inference() + inputs = tokenizer(table=table, queries=queries, return_tensors="tf") + outputs = model(**inputs) + + # test the logits + logits = outputs.logits + expected_shape = tf.TensorShape([1, 21]) + tf.debugging.assert_equal(logits.shape, expected_shape) + expected_slice = tf.constant( + [ + [ + -10011.1084, + -10011.1084, + -10011.1084, + -10011.1084, + -10011.1084, + -10011.1084, + -10011.1084, + -10011.1084, + -10011.1084, + -18.6185989, + -10008.7969, + 17.6355762, + 17.6355762, + 17.6355762, + -10002.4404, + -18.7111301, + -18.7111301, + -18.7111301, + -18.7111301, + -18.7111301, + -10007.0977, + ] + ] + ) + tf.debugging.assert_near(logits, expected_slice, atol=0.02) + + # test the aggregation logits + logits_aggregation = outputs.logits_aggregation + expected_shape = tf.TensorShape([1, 4]) + tf.debugging.assert_equal(logits_aggregation.shape, expected_shape) + expected_tensor = tf.constant([[16.5659733, -3.06624889, -2.34152961, -0.970244825]]) + tf.debugging.assert_near(logits_aggregation, expected_tensor, atol=0.003) + + @slow + def test_inference_classification_head(self): + # note that google/tapas-base-finetuned-tabfact should correspond to tapas_tabfact_inter_masklm_base_reset + model = TFTapasForSequenceClassification.from_pretrained("google/tapas-base-finetuned-tabfact") + tokenizer = self.default_tokenizer + + table, queries = prepare_tapas_single_inputs_for_inference() + inputs = tokenizer(table=table, queries=queries, return_tensors="tf") + outputs = model(**inputs) + + # test the classification logits + logits = outputs.logits + expected_shape = tf.TensorShape([1, 2]) + tf.debugging.assert_equal(logits.shape, expected_shape) + expected_slice = tf.constant([[0.795137286, 9.5572]]) + tf.debugging.assert_near(logits, expected_slice, atol=0.05) + + +# Below: tests for Tapas utilities which are defined in modeling_tf_tapas.py. +# These are based on segmented_tensor_test.py of the original implementation. +# URL: https://github.com/google-research/tapas/blob/master/tapas/models/segmented_tensor_test.py +@require_tensorflow_probability +class TFTapasUtilsTest(unittest.TestCase): + def _prepare_tables(self): + """Prepares two tables, both with three distinct rows. + The first table has two columns: + 1.0, 2.0 | 3.0 + 2.0, 0.0 | 1.0 + 1.0, 3.0 | 4.0 + The second table has three columns: + 1.0 | 2.0 | 3.0 + 2.0 | 0.0 | 1.0 + 1.0 | 3.0 | 4.0 + Returns: + SegmentedTensors with the tables. + """ + values = tf.constant( + [ + [[1.0, 2.0, 3.0], [2.0, 0.0, 1.0], [1.0, 3.0, 4.0]], + [[1.0, 2.0, 3.0], [2.0, 0.0, 1.0], [1.0, 3.0, 4.0]], + ] + ) + row_index = IndexMap( + indices=[ + [[0, 0, 0], [1, 1, 1], [2, 2, 2]], + [[0, 0, 0], [1, 1, 1], [2, 2, 2]], + ], + num_segments=3, + batch_dims=1, + ) + col_index = IndexMap( + indices=[ + [[0, 0, 1], [0, 0, 1], [0, 0, 1]], + [[0, 1, 2], [0, 1, 2], [0, 1, 2]], + ], + num_segments=3, + batch_dims=1, + ) + return values, row_index, col_index + + def test_product_index(self): + _, row_index, col_index = self._prepare_tables() + cell_index = ProductIndexMap(row_index, col_index) + row_index_proj = cell_index.project_outer(cell_index) + col_index_proj = cell_index.project_inner(cell_index) + + ind = cell_index.indices + self.assertEqual(cell_index.num_segments, 9) + + # Projections should give back the original indices. + # we use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal(row_index.indices.numpy(), row_index_proj.indices.numpy()) + self.assertEqual(row_index.num_segments, row_index_proj.num_segments) + self.assertEqual(row_index.batch_dims, row_index_proj.batch_dims) + # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal(col_index.indices.numpy(), col_index_proj.indices.numpy()) + self.assertEqual(col_index.batch_dims, col_index_proj.batch_dims) + + # The first and second "column" are identified in the first table. + for i in range(3): + self.assertEqual(ind[0, i, 0], ind[0, i, 1]) + self.assertNotEqual(ind[0, i, 0], ind[0, i, 2]) + + # All rows are distinct in the first table. + for i, i_2 in zip(range(3), range(3)): + for j, j_2 in zip(range(3), range(3)): + if i != i_2 and j != j_2: + self.assertNotEqual(ind[0, i, j], ind[0, i_2, j_2]) + + # All cells are distinct in the second table. + for i, i_2 in zip(range(3), range(3)): + for j, j_2 in zip(range(3), range(3)): + if i != i_2 or j != j_2: + self.assertNotEqual(ind[1, i, j], ind[1, i_2, j_2]) + + def test_flatten(self): + _, row_index, col_index = self._prepare_tables() + row_index_flat = flatten(row_index) + col_index_flat = flatten(col_index) + + shape = [3, 4, 5] + batched_index = IndexMap(indices=tf.zeros(shape, dtype=tf.int32), num_segments=1, batch_dims=3) + batched_index_flat = flatten(batched_index) + + # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal( + row_index_flat.indices.numpy(), [0, 0, 0, 1, 1, 1, 2, 2, 2, 3, 3, 3, 4, 4, 4, 5, 5, 5] + ) + np.testing.assert_array_equal( + col_index_flat.indices.numpy(), [0, 0, 1, 0, 0, 1, 0, 0, 1, 3, 4, 5, 3, 4, 5, 3, 4, 5] + ) + self.assertEqual(batched_index_flat.num_segments.numpy(), np.prod(shape)) + np.testing.assert_array_equal(batched_index_flat.indices.numpy(), range(np.prod(shape))) + + def test_range_index_map(self): + batch_shape = [3, 4] + num_segments = 5 + index = range_index_map(batch_shape, num_segments) + + self.assertEqual(num_segments, index.num_segments) + self.assertEqual(2, index.batch_dims) + indices = index.indices + # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal(list(indices.shape), [3, 4, 5]) + for i in range(batch_shape[0]): + for j in range(batch_shape[1]): + # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal(indices[i, j, :].numpy(), range(num_segments)) + + def test_reduce_sum(self): + values, row_index, col_index = self._prepare_tables() + cell_index = ProductIndexMap(row_index, col_index) + row_sum, _ = reduce_sum(values, row_index) + col_sum, _ = reduce_sum(values, col_index) + cell_sum, _ = reduce_sum(values, cell_index) + + # We use np.testing.assert_allclose rather than Tensorflow's assertAllClose + np.testing.assert_allclose(row_sum.numpy(), [[6.0, 3.0, 8.0], [6.0, 3.0, 8.0]]) + np.testing.assert_allclose(col_sum.numpy(), [[9.0, 8.0, 0.0], [4.0, 5.0, 8.0]]) + np.testing.assert_allclose( + cell_sum.numpy(), + [[3.0, 3.0, 0.0, 2.0, 1.0, 0.0, 4.0, 4.0, 0.0], [1.0, 2.0, 3.0, 2.0, 0.0, 1.0, 1.0, 3.0, 4.0]], + ) + + def test_reduce_mean(self): + values, row_index, col_index = self._prepare_tables() + cell_index = ProductIndexMap(row_index, col_index) + row_mean, _ = reduce_mean(values, row_index) + col_mean, _ = reduce_mean(values, col_index) + cell_mean, _ = reduce_mean(values, cell_index) + + # We use np.testing.assert_allclose rather than Tensorflow's assertAllClose + np.testing.assert_allclose( + row_mean.numpy(), [[6.0 / 3.0, 3.0 / 3.0, 8.0 / 3.0], [6.0 / 3.0, 3.0 / 3.0, 8.0 / 3.0]] + ) + np.testing.assert_allclose(col_mean.numpy(), [[9.0 / 6.0, 8.0 / 3.0, 0.0], [4.0 / 3.0, 5.0 / 3.0, 8.0 / 3.0]]) + np.testing.assert_allclose( + cell_mean.numpy(), + [ + [3.0 / 2.0, 3.0, 0.0, 2.0 / 2.0, 1.0, 0.0, 4.0 / 2.0, 4.0, 0.0], + [1.0, 2.0, 3.0, 2.0, 0.0, 1.0, 1.0, 3.0, 4.0], + ], + ) + + def test_reduce_max(self): + values = tf.convert_to_tensor([2.0, 1.0, 0.0, 3.0]) + index = IndexMap(indices=tf.convert_to_tensor([0, 1, 0, 1]), num_segments=2) + maximum, _ = reduce_max(values, index) + + # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal(maximum.numpy(), [2, 3]) + + def test_reduce_sum_vectorized(self): + values = tf.convert_to_tensor([[1.0, 2.0, 3.0], [2.0, 3.0, 4.0], [3.0, 4.0, 5.0]]) + index = IndexMap(indices=tf.convert_to_tensor([0, 0, 1]), num_segments=2, batch_dims=0) + sums, new_index = reduce_sum(values, index) + + # We use np.testing.assert_allclose rather than Tensorflow's assertAllClose + np.testing.assert_allclose(sums.numpy(), [[3.0, 5.0, 7.0], [3.0, 4.0, 5.0]]) + # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal(new_index.indices.numpy(), [0, 1]) + np.testing.assert_array_equal(new_index.num_segments.numpy(), 2) + np.testing.assert_array_equal(new_index.batch_dims, 0) + + def test_gather(self): + values, row_index, col_index = self._prepare_tables() + cell_index = ProductIndexMap(row_index, col_index) + + # Compute sums and then gather. The result should have the same shape as + # the original table and each element should contain the sum the values in + # its cell. + sums, _ = reduce_sum(values, cell_index) + cell_sum = gather(sums, cell_index) + assert cell_sum.shape == values.shape + + # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_allclose( + cell_sum.numpy(), + [[[3.0, 3.0, 3.0], [2.0, 2.0, 1.0], [4.0, 4.0, 4.0]], [[1.0, 2.0, 3.0], [2.0, 0.0, 1.0], [1.0, 3.0, 4.0]]], + ) + + def test_gather_vectorized(self): + values = tf.constant([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) + index = IndexMap(indices=tf.convert_to_tensor([[0, 1], [1, 0]]), num_segments=2, batch_dims=1) + result = gather(values, index) + + # We use np.testing.assert_array_equal rather than Tensorflow's assertAllEqual + np.testing.assert_array_equal(result.numpy(), [[[1, 2], [3, 4]], [[7, 8], [5, 6]]]) diff --git a/tests/test_pipelines_table_question_answering.py b/tests/test_pipelines_table_question_answering.py index a319387ee2..789e92c3d7 100644 --- a/tests/test_pipelines_table_question_answering.py +++ b/tests/test_pipelines_table_question_answering.py @@ -19,11 +19,13 @@ from transformers import ( AutoModelForTableQuestionAnswering, AutoTokenizer, TableQuestionAnsweringPipeline, + TFAutoModelForTableQuestionAnswering, pipeline, ) from transformers.testing_utils import ( is_pipeline_test, require_pandas, + require_tensorflow_probability, require_tf, require_torch, require_torch_scatter, @@ -33,6 +35,7 @@ from transformers.testing_utils import ( from .test_pipelines_common import PipelineTestCaseMeta +@require_tensorflow_probability @require_torch_scatter @require_torch @require_pandas @@ -43,9 +46,105 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): model_mapping = MODEL_FOR_TABLE_QUESTION_ANSWERING_MAPPING @require_tf - @unittest.skip("Table question answering not implemented in TF") def test_small_model_tf(self): - pass + model_id = "lysandre/tiny-tapas-random-wtq" + model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True) + tokenizer = AutoTokenizer.from_pretrained(model_id) + self.assertIsInstance(model.config.aggregation_labels, dict) + self.assertIsInstance(model.config.no_aggregation_label_index, int) + + table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer) + outputs = table_querier( + table={ + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": ["56", "45", "59"], + "number of movies": ["87", "53", "69"], + "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + }, + query="how many movies has george clooney played in?", + ) + self.assertEqual( + outputs, + {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, + ) + outputs = table_querier( + table={ + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": ["56", "45", "59"], + "number of movies": ["87", "53", "69"], + "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + }, + query=["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"], + ) + self.assertEqual( + outputs, + [ + {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, + {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, + {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, + ], + ) + outputs = table_querier( + table={ + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", "4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + }, + query=[ + "What repository has the largest number of stars?", + "Given that the numbers of stars defines if a repository is active, what repository is the most active?", + "What is the number of repositories?", + "What is the average number of stars?", + "What is the total amount of stars?", + ], + ) + self.assertEqual( + outputs, + [ + {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, + {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, + {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, + {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, + {"answer": "AVERAGE > ", "coordinates": [], "cells": [], "aggregator": "AVERAGE"}, + ], + ) + + with self.assertRaises(ValueError): + table_querier(query="What does it do with empty context ?", table=None) + with self.assertRaises(ValueError): + table_querier(query="What does it do with empty context ?", table="") + with self.assertRaises(ValueError): + table_querier(query="What does it do with empty context ?", table={}) + with self.assertRaises(ValueError): + table_querier( + table={ + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", "4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + } + ) + with self.assertRaises(ValueError): + table_querier( + query="", + table={ + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", "4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + }, + ) + with self.assertRaises(ValueError): + table_querier( + query=None, + table={ + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", "4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + }, + ) @require_torch def test_small_model_pt(self): @@ -148,7 +247,8 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): }, ) - def test_slow_tokenizer_sqa(self): + @require_torch + def test_slow_tokenizer_sqa_pt(self): model_id = "lysandre/tiny-tapas-random-sqa" model = AutoModelForTableQuestionAnswering.from_pretrained(model_id) tokenizer = AutoTokenizer.from_pretrained(model_id) @@ -265,8 +365,126 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): }, ) + @require_tf + def test_slow_tokenizer_sqa_tf(self): + model_id = "lysandre/tiny-tapas-random-sqa" + model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id, from_pt=True) + tokenizer = AutoTokenizer.from_pretrained(model_id) + table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer) + + inputs = { + "table": { + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": ["56", "45", "59"], + "number of movies": ["87", "53", "69"], + "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + }, + "query": ["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"], + } + sequential_outputs = table_querier(**inputs, sequential=True) + batch_outputs = table_querier(**inputs, sequential=False) + + self.assertEqual(len(sequential_outputs), 3) + self.assertEqual(len(batch_outputs), 3) + self.assertEqual(sequential_outputs[0], batch_outputs[0]) + self.assertNotEqual(sequential_outputs[1], batch_outputs[1]) + # self.assertNotEqual(sequential_outputs[2], batch_outputs[2]) + + table_querier = TableQuestionAnsweringPipeline(model=model, tokenizer=tokenizer) + outputs = table_querier( + table={ + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": ["56", "45", "59"], + "number of movies": ["87", "53", "69"], + "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + }, + query="how many movies has george clooney played in?", + ) + self.assertEqual( + outputs, + {"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]}, + ) + outputs = table_querier( + table={ + "actors": ["brad pitt", "leonardo di caprio", "george clooney"], + "age": ["56", "45", "59"], + "number of movies": ["87", "53", "69"], + "date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + }, + query=["how many movies has george clooney played in?", "how old is he?", "what's his date of birth?"], + ) + self.assertEqual( + outputs, + [ + {"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]}, + {"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]}, + {"answer": "7 february 1967", "coordinates": [(0, 3)], "cells": ["7 february 1967"]}, + ], + ) + outputs = table_querier( + table={ + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", "4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + }, + query=[ + "What repository has the largest number of stars?", + "Given that the numbers of stars defines if a repository is active, what repository is the most active?", + "What is the number of repositories?", + "What is the average number of stars?", + "What is the total amount of stars?", + ], + ) + self.assertEqual( + outputs, + [ + {"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]}, + {"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]}, + {"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]}, + {"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]}, + {"answer": "Python, Python", "coordinates": [(0, 3), (1, 3)], "cells": ["Python", "Python"]}, + ], + ) + + with self.assertRaises(ValueError): + table_querier(query="What does it do with empty context ?", table=None) + with self.assertRaises(ValueError): + table_querier(query="What does it do with empty context ?", table="") + with self.assertRaises(ValueError): + table_querier(query="What does it do with empty context ?", table={}) + with self.assertRaises(ValueError): + table_querier( + table={ + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", "4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + } + ) + with self.assertRaises(ValueError): + table_querier( + query="", + table={ + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", "4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + }, + ) + with self.assertRaises(ValueError): + table_querier( + query=None, + table={ + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", "4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + }, + ) + @slow - def test_integration_wtq(self): + def test_integration_wtq_pt(self): table_querier = pipeline("table-question-answering") data = { @@ -310,7 +528,54 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): self.assertListEqual(results, expected_results) @slow - def test_integration_sqa(self): + def test_integration_wtq_tf(self): + model_id = "google/tapas-base-finetuned-wtq" + model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + table_querier = pipeline("table-question-answering", model=model, tokenizer=tokenizer) + + data = { + "Repository": ["Transformers", "Datasets", "Tokenizers"], + "Stars": ["36542", "4512", "3934"], + "Contributors": ["651", "77", "34"], + "Programming language": ["Python", "Python", "Rust, Python and NodeJS"], + } + queries = [ + "What repository has the largest number of stars?", + "Given that the numbers of stars defines if a repository is active, what repository is the most active?", + "What is the number of repositories?", + "What is the average number of stars?", + "What is the total amount of stars?", + ] + + results = table_querier(data, queries) + + expected_results = [ + {"answer": "Transformers", "coordinates": [(0, 0)], "cells": ["Transformers"], "aggregator": "NONE"}, + {"answer": "Transformers", "coordinates": [(0, 0)], "cells": ["Transformers"], "aggregator": "NONE"}, + { + "answer": "COUNT > Transformers, Datasets, Tokenizers", + "coordinates": [(0, 0), (1, 0), (2, 0)], + "cells": ["Transformers", "Datasets", "Tokenizers"], + "aggregator": "COUNT", + }, + { + "answer": "AVERAGE > 36542, 4512, 3934", + "coordinates": [(0, 1), (1, 1), (2, 1)], + "cells": ["36542", "4512", "3934"], + "aggregator": "AVERAGE", + }, + { + "answer": "SUM > 36542, 4512, 3934", + "coordinates": [(0, 1), (1, 1), (2, 1)], + "cells": ["36542", "4512", "3934"], + "aggregator": "SUM", + }, + ] + self.assertListEqual(results, expected_results) + + @slow + def test_integration_sqa_pt(self): table_querier = pipeline( "table-question-answering", model="google/tapas-base-finetuned-sqa", @@ -331,3 +596,29 @@ class TQAPipelineTests(unittest.TestCase, metaclass=PipelineTestCaseMeta): {"answer": "28 november 1967", "coordinates": [(2, 3)], "cells": ["28 november 1967"]}, ] self.assertListEqual(results, expected_results) + + @slow + def test_integration_sqa_tf(self): + model_id = "google/tapas-base-finetuned-sqa" + model = TFAutoModelForTableQuestionAnswering.from_pretrained(model_id) + tokenizer = AutoTokenizer.from_pretrained(model_id) + table_querier = pipeline( + "table-question-answering", + model=model, + tokenizer=tokenizer, + ) + data = { + "Actors": ["Brad Pitt", "Leonardo Di Caprio", "George Clooney"], + "Age": ["56", "45", "59"], + "Number of movies": ["87", "53", "69"], + "Date of birth": ["7 february 1967", "10 june 1996", "28 november 1967"], + } + queries = ["How many movies has George Clooney played in?", "How old is he?", "What's his date of birth?"] + results = table_querier(data, queries, sequential=True) + + expected_results = [ + {"answer": "69", "coordinates": [(2, 2)], "cells": ["69"]}, + {"answer": "59", "coordinates": [(2, 1)], "cells": ["59"]}, + {"answer": "28 november 1967", "coordinates": [(2, 3)], "cells": ["28 november 1967"]}, + ] + self.assertListEqual(results, expected_results)